diff --git a/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.cpp b/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.cpp
index 83111a1c752b09..2ce1dc14c3b0da 100644
--- a/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.cpp
@@ -12,8 +12,6 @@
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
 
-#include
-
 using namespace clang::ast_matchers;
 
 namespace clang {
@@ -132,6 +130,12 @@ void ConstCorrectnessCheck::check(const MatchFinder::MatchResult &Result) {
     VC = VariableCategory::Reference;
   if (Variable->getType()->isPointerType())
     VC = VariableCategory::Pointer;
+  if (Variable->getType()->isArrayType()) {
+    if (const auto *ArrayT = dyn_cast<ArrayType>(Variable->getType())) {
+      if (ArrayT->getElementType()->isPointerType())
+        VC = VariableCategory::Pointer;
+    }
+  }
 
   // Each variable can only be in one category: Value, Pointer, Reference.
   // Analysis can be controlled for every category.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-values.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-values.cpp
index 8a267eb04a1516..9a150e887234d9 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-values.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-values.cpp
@@ -10,4 +10,65 @@ void potential_const_pointer() {
   double *p_local0 = &np_local0[1];
   // CHECK-MESSAGES: [[@LINE-1]]:3: warning: variable 'p_local0' of type 'double *' can be declared 'const'
   // CHECK-FIXES: double *const p_local0
+
+  using doublePtr = double*;
+  using doubleArray = double[15];
+  doubleArray np_local1;
+  doublePtr p_local1 = &np_local1[0];
+  // CHECK-MESSAGES: [[@LINE-1]]:3: warning: variable 'p_local1' of type 'doublePtr' (aka 'double *') can be declared 'const'
+  // CHECK-FIXES: doublePtr const p_local1
+}
+
+void range_for() {
+  int np_local0[2] = {1, 2};
+  int *p_local0[2] = {&np_local0[0], &np_local0[1]};
+  // CHECK-MESSAGES: [[@LINE-1]]:3: warning: variable 'p_local0' of type 'int *[2]' can be declared 'const'
+  // CHECK-FIXES: int *const p_local0[2]
+  for (const int *p_local1 : p_local0) {
+  // CHECK-MESSAGES: [[@LINE-1]]:8: warning: variable 'p_local1' of type 'const int *' can be declared 'const'
+  // CHECK-FIXES: for (const int *const p_local1 : p_local0)
+  }
+
+  int *p_local2[2] = {nullptr, nullptr};
+  // CHECK-MESSAGES: [[@LINE-1]]:3: warning: variable 'p_local2' of type 'int *[2]' can be declared 'const'
+  // CHECK-FIXES: int *const p_local2[2]
+  for (const auto *con_ptr : p_local2) {
+  }
+
+}
+
+template <typename T>
+struct SmallVectorBase {
+  T data[4];
+  void push_back(const T &el) {}
+  int size() const { return 4; }
+  T *begin() { return data; }
+  const T *begin() const { return data; }
+  T *end() { return data + 4; }
+  const T *end() const { return data + 4; }
+};
+
+template <typename T>
+struct SmallVector : SmallVectorBase<T> {};
+
+template <typename T>
+void EmitProtocolMethodList(T &&Methods) {
+  // Note: If the template is uninstantiated, the analysis does not figure
+  // out that p_local0 could be const. Not sure why; it probably bails
+  // because some expressions are type-dependent.
+  SmallVector<const int *> p_local0;
+  // CHECK-MESSAGES: [[@LINE-1]]:3: warning: variable 'p_local0' of type 'SmallVector<const int *>' can be declared 'const'
+  // CHECK-FIXES: SmallVector<const int *> const p_local0
+  SmallVector<const int *> np_local0;
+  for (const auto *I : Methods) {
+    if (I == nullptr)
+      np_local0.push_back(I);
+  }
+  p_local0.size();
+}
+void instantiate() {
+  int *p_local0[4] = {nullptr, nullptr, nullptr, nullptr};
+  // CHECK-MESSAGES: [[@LINE-1]]:3: warning: variable 'p_local0' of type 'int *[4]' can be declared 'const'
+  // CHECK-FIXES: int *const p_local0[4]
+  EmitProtocolMethodList(p_local0);
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp
index 8acb0bd6b42c42..f469bfd055c932 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp
@@ -526,18 +526,13 @@ void range_for() {
   // CHECK-FIXES: int const p_local1[2]
   for (const int &const_ref : p_local1) {
   }
+}
 
-  int *p_local2[2] = {&np_local0[0], &np_local0[1]};
-  // CHECK-MESSAGES: [[@LINE-1]]:3: warning: variable 'p_local2' of type 'int *[2]' can be declared 'const'
-  // CHECK-FIXES: int *const p_local2[2]
-  for (const int *con_ptr : p_local2) {
-  }
-
-  int *p_local3[2] = {nullptr, nullptr};
-  // CHECK-MESSAGES: [[@LINE-1]]:3: warning: variable 'p_local3' of type 'int *[2]' can be declared 'const'
-  // CHECK-FIXES: int *const p_local3[2]
-  for (const auto *con_ptr : p_local3) {
-  }
+void arrays_of_pointers_are_ignored() {
+  int *np_local0[2] = {nullptr, nullptr};
+
+  using intPtr = int*;
+  intPtr np_local1[2] = {nullptr, nullptr};
 }
 
 inline void *operator new(decltype(sizeof(void *)), void *p) { return p; }
@@ -908,41 +903,6 @@ void vlas() {
   sizeof(int[++N]);
 }
 
-template <typename T>
-struct SmallVectorBase {
-  T data[4];
-  void push_back(const T &el) {}
-  int size() const { return 4; }
-  T *begin() { return data; }
-  const T *begin() const { return data; }
-  T *end() { return data + 4; }
-  const T *end() const { return data + 4; }
-};
-
-template <typename T>
-struct SmallVector : SmallVectorBase<T> {};
-
-template <typename T>
-void EmitProtocolMethodList(T &&Methods) {
-  // Note: If the template is uninstantiated, the analysis does not figure
-  // out that p_local0 could be const. Not sure why; it probably bails
-  // because some expressions are type-dependent.
-  SmallVector<const int *> p_local0;
-  // CHECK-MESSAGES: [[@LINE-1]]:3: warning: variable 'p_local0' of type 'SmallVector<const int *>' can be declared 'const'
-  // CHECK-FIXES: SmallVector<const int *> const p_local0
-  SmallVector<const int *> np_local0;
-  for (const auto *I : Methods) {
-    if (I == nullptr)
-      np_local0.push_back(I);
-  }
-  p_local0.size();
-}
-void instantiate() {
-  int *p_local0[4] = {nullptr, nullptr, nullptr, nullptr};
-  // CHECK-MESSAGES: [[@LINE-1]]:3: warning: variable 'p_local0' of type 'int *[4]' can be declared 'const'
-  // CHECK-FIXES: int *const p_local0[4]
-  EmitProtocolMethodList(p_local0);
-}
 
 struct base {
   int member;
 };
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h
index 220aee759a935c..a595d517cd2768 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h
@@ -214,8 +214,14 @@ struct NodeBuilderContext {
   const CFGBlock *Block;
   const LocationContext *LC;
 
+  NodeBuilderContext(const CoreEngine &E, const CFGBlock *B,
+                     const LocationContext *L)
+      : Eng(E), Block(B), LC(L) {
+    assert(B);
+  }
+
   NodeBuilderContext(const CoreEngine &E, const CFGBlock *B, ExplodedNode *N)
-      : Eng(E), Block(B), LC(N->getLocationContext()) { assert(B); }
+      : NodeBuilderContext(E, B, N->getLocationContext()) {}
 
   /// Return the CFGBlock associated with this builder.
   const CFGBlock *getBlock() const { return Block; }
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
index a905f9097750d0..848e43d15fbff4 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
@@ -732,6 +732,7 @@ class ExprEngine {
   /// A multi-dimensional array is also a contiguous memory location in
   /// row-major order, so for arr[0][0] Idx is 0 and for arr[2][2] Idx is 8.
   SVal computeObjectUnderConstruction(const Expr *E, ProgramStateRef State,
+                                      const NodeBuilderContext *BldrCtx,
                                       const LocationContext *LCtx,
                                       const ConstructionContext *CC,
                                       EvalCallOptions &CallOpts,
@@ -748,13 +749,13 @@
 
   /// A convenient wrapper around computeObjectUnderConstruction
   /// and updateObjectsUnderConstruction.
-  std::pair<ProgramStateRef, SVal>
-  handleConstructionContext(const Expr *E, ProgramStateRef State,
-                            const LocationContext *LCtx,
-                            const ConstructionContext *CC,
-                            EvalCallOptions &CallOpts, unsigned Idx = 0) {
+  std::pair<ProgramStateRef, SVal> handleConstructionContext(
+      const Expr *E, ProgramStateRef State, const NodeBuilderContext *BldrCtx,
+      const LocationContext *LCtx, const ConstructionContext *CC,
+      EvalCallOptions &CallOpts, unsigned Idx = 0) {
-    SVal V = computeObjectUnderConstruction(E, State, LCtx, CC, CallOpts, Idx);
+    SVal V = computeObjectUnderConstruction(E, State, BldrCtx, LCtx, CC,
+                                            CallOpts, Idx);
 
     State = updateObjectsUnderConstruction(V, E, State, LCtx, CC, CallOpts);
 
     return std::make_pair(State, V);
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 6f6bad1052a15a..63027413f37aa1 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -370,6 +370,7 @@ CGDebugInfo::computeChecksum(FileID FID, SmallString<64> &Checksum) const {
     llvm::toHex(llvm::SHA256::hash(Data), /*LowerCase=*/true, Checksum);
     return llvm::DIFile::CSK_SHA256;
   }
+  llvm_unreachable("Unhandled DebugSrcHashKind enum");
 }
 
 Optional<StringRef> CGDebugInfo::getSource(const SourceManager &SM,
diff --git a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
index 57591960a14017..a8b49adfb4c9a6 100644
--- a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
+++ b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
@@ -485,9 +485,9 @@ CallEvent::getReturnValueUnderConstruction() const {
 
   EvalCallOptions CallOpts;
   ExprEngine &Engine = getState()->getStateManager().getOwningEngine();
-  SVal RetVal =
-      Engine.computeObjectUnderConstruction(getOriginExpr(), getState(),
-                                            getLocationContext(), CC, CallOpts);
+  SVal RetVal = Engine.computeObjectUnderConstruction(
+      getOriginExpr(), getState(), &Engine.getBuilderContext(),
+      getLocationContext(), CC, CallOpts);
 
   return RetVal;
 }
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
index ae878ecbcc34a7..476afc598ac6cd 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
@@ -111,9 +111,15 @@ SVal ExprEngine::makeElementRegion(ProgramStateRef State, SVal LValue,
   return LValue;
 }
 
+// When the prvalue is returned from the function (the kind is one of
+// SimpleReturnedValueKind, CXX17ElidedCopyReturnedValueKind), its
+// materialization happens in the context of the caller.
+// We pass BldrCtx explicitly, as currBldrCtx always refers to the callee's
+// context.
 SVal ExprEngine::computeObjectUnderConstruction(
-    const Expr *E, ProgramStateRef State, const LocationContext *LCtx,
-    const ConstructionContext *CC, EvalCallOptions &CallOpts, unsigned Idx) {
+    const Expr *E, ProgramStateRef State, const NodeBuilderContext *BldrCtx,
+    const LocationContext *LCtx, const ConstructionContext *CC,
+    EvalCallOptions &CallOpts, unsigned Idx) {
+
   SValBuilder &SVB = getSValBuilder();
   MemRegionManager &MRMgr = SVB.getRegionManager();
   ASTContext &ACtx = SVB.getContext();
@@ -210,8 +216,11 @@ SVal ExprEngine::computeObjectUnderConstruction(
           CallerLCtx = CallerLCtx->getParent();
           assert(!isa<BlockInvocationContext>(CallerLCtx));
         }
+
+        NodeBuilderContext CallerBldrCtx(getCoreEngine(),
+                                         SFC->getCallSiteBlock(), CallerLCtx);
         return computeObjectUnderConstruction(
-            cast<Expr>(SFC->getCallSite()), State, CallerLCtx,
+            cast<Expr>(SFC->getCallSite()), State, &CallerBldrCtx, CallerLCtx,
            RTC->getConstructionContext(), CallOpts);
       } else {
        // We are on the top frame of the analysis.
We do not know where is the @@ -251,7 +260,7 @@ SVal ExprEngine::computeObjectUnderConstruction( EvalCallOptions PreElideCallOpts = CallOpts; SVal V = computeObjectUnderConstruction( - TCC->getConstructorAfterElision(), State, LCtx, + TCC->getConstructorAfterElision(), State, BldrCtx, LCtx, TCC->getConstructionContextAfterElision(), CallOpts); // FIXME: This definition of "copy elision has not failed" is unreliable. @@ -319,7 +328,7 @@ SVal ExprEngine::computeObjectUnderConstruction( CallEventManager &CEMgr = getStateManager().getCallEventManager(); auto getArgLoc = [&](CallEventRef<> Caller) -> Optional { const LocationContext *FutureSFC = - Caller->getCalleeStackFrame(currBldrCtx->blockCount()); + Caller->getCalleeStackFrame(BldrCtx->blockCount()); // Return early if we are unable to reliably foresee // the future stack frame. if (!FutureSFC) @@ -338,7 +347,7 @@ SVal ExprEngine::computeObjectUnderConstruction( // because this-argument is implemented as a normal argument in // operator call expressions but not in operator declarations. const TypedValueRegion *TVR = Caller->getParameterLocation( - *Caller->getAdjustedParameterIndex(Idx), currBldrCtx->blockCount()); + *Caller->getAdjustedParameterIndex(Idx), BldrCtx->blockCount()); if (!TVR) return None; @@ -643,8 +652,8 @@ void ExprEngine::handleConstructor(const Expr *E, } // The target region is found from construction context. - std::tie(State, Target) = - handleConstructionContext(CE, State, LCtx, CC, CallOpts, Idx); + std::tie(State, Target) = handleConstructionContext( + CE, State, currBldrCtx, LCtx, CC, CallOpts, Idx); break; } case CXXConstructExpr::CK_VirtualBase: { diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp index dbfded29c1ae4f..48b5db1eb4a521 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp @@ -774,9 +774,9 @@ ProgramStateRef ExprEngine::bindReturnValue(const CallEvent &Call, SVal Target; assert(RTC->getStmt() == Call.getOriginExpr()); EvalCallOptions CallOpts; // FIXME: We won't really need those. - std::tie(State, Target) = - handleConstructionContext(Call.getOriginExpr(), State, LCtx, - RTC->getConstructionContext(), CallOpts); + std::tie(State, Target) = handleConstructionContext( + Call.getOriginExpr(), State, currBldrCtx, LCtx, + RTC->getConstructionContext(), CallOpts); const MemRegion *TargetR = Target.getAsRegion(); assert(TargetR); // Invalidate the region so that it didn't look uninitialized. 
If this is diff --git a/clang/test/Analysis/copy-elision.cpp b/clang/test/Analysis/copy-elision.cpp index dee1a5fc86e7fa..991f325c05853d 100644 --- a/clang/test/Analysis/copy-elision.cpp +++ b/clang/test/Analysis/copy-elision.cpp @@ -20,6 +20,7 @@ #endif void clang_analyzer_eval(bool); +void clang_analyzer_dump(int); namespace variable_functional_cast_crash { @@ -418,3 +419,31 @@ void test_copy_elision() { } } // namespace address_vector_tests + +namespace arg_directly_from_return_in_loop { + +struct Result { + int value; +}; + +Result create() { + return Result{10}; +} + +int accessValue(Result r) { + return r.value; +} + +void test() { + for (int i = 0; i < 3; ++i) { + int v = accessValue(create()); + if (i == 0) { + clang_analyzer_dump(v); // expected-warning {{10 S32b}} + } else { + clang_analyzer_dump(v); // expected-warning {{10 S32b}} + // was {{reg_${{[0-9]+}} }} for C++11 + } + } +} + +} // namespace arg_directly_from_return_in_loop diff --git a/clang/test/C/drs/dr3xx.c b/clang/test/C/drs/dr3xx.c index 795bc590f5351b..0f06118ca6e572 100644 --- a/clang/test/C/drs/dr3xx.c +++ b/clang/test/C/drs/dr3xx.c @@ -1,8 +1,8 @@ /* RUN: %clang_cc1 -std=c89 -fsyntax-only -Wvla -verify=expected,c89only -pedantic -Wno-c11-extensions %s - RUN: %clang_cc1 -std=c99 -fsyntax-only -Wvla -verify -pedantic -Wno-c11-extensions %s - RUN: %clang_cc1 -std=c11 -fsyntax-only -Wvla -verify -pedantic %s - RUN: %clang_cc1 -std=c17 -fsyntax-only -Wvla -verify -pedantic %s - RUN: %clang_cc1 -std=c2x -fsyntax-only -Wvla -verify -pedantic %s + RUN: %clang_cc1 -std=c99 -fsyntax-only -Wvla -verify=expected,c99andup -pedantic -Wno-c11-extensions %s + RUN: %clang_cc1 -std=c11 -fsyntax-only -Wvla -verify=expected,c99andup -pedantic %s + RUN: %clang_cc1 -std=c17 -fsyntax-only -Wvla -verify=expected,c99andup -pedantic %s + RUN: %clang_cc1 -std=c2x -fsyntax-only -Wvla -verify=expected,c99andup -pedantic %s */ /* The following are DRs which do not require tests to demonstrate @@ -108,10 +108,91 @@ _Static_assert(sizeof(dr315.a + dr315.b) == sizeof(unsigned long long), ""); /* */ _Static_assert(sizeof(dr315.c + dr315.d) == sizeof(int), ""); +#if __STDC_VERSION__ < 202000L /* WG14 DR316: yes * Unprototyped function types */ -#if __STDC_VERSION__ < 202000L void dr316_1(a) int a; {} /* expected-warning {{a function definition without a prototype is deprecated in all versions of C and is not supported in C2x}} */ void (*dr316_1_ptr)(int, int, int) = dr316_1; + +/* WG14 DR317: yes + * Function definitions with empty parentheses + * + * Despite the function with empty parens being a definition, this does not + * provide a prototype for the function. However, calling the function with + * arguments is undefined behavior, so it is defensible for us to warn the user + * about it. They key point to this DR is that we give the "without a + * prototype" warnings to demonstrate we don't give this function a prototype. 
+ */ +void dr317_1() {} /* expected-warning {{a function declaration without a prototype is deprecated in all versions of C}} */ +void dr317_2(void) { + if (0) + dr317_1(1); /* expected-warning {{too many arguments in call to 'dr317_1'}} + expected-warning {{passing arguments to 'dr317_1' without a prototype is deprecated in all versions of C and is not supported in C2x}} + */ +} #endif /* __STDC_VERSION__ < 202000L */ + +/* WG14 DR320: yes + * Scope of variably modified type + */ +int dr320_v; +typedef int dr320_t[dr320_v]; /* c89only-warning {{variable length arrays are a C99 feature}} + expected-error {{variable length array declaration not allowed at file scope}} + c99andup-warning {{variable length array used}} + */ +void dr320(int okay[dr320_v]) { /* c89only-warning {{variable length arrays are a C99 feature}} + c99andup-warning {{variable length array used}} + */ + typedef int type[dr320_v]; /* c89only-warning {{variable length arrays are a C99 feature}} + c99andup-warning {{variable length array used}} + */ + extern type bad; /* expected-error {{variable length array declaration cannot have 'extern' linkage}} */ + + /* C99 6.7.5.2p2, second sentence. */ + static type fine; /* expected-error {{variable length array declaration cannot have 'static' storage duration}} */ +} + +/* WG14 DR321: yes + * Wide character code values for members of the basic character set + */ +#define DR321 (\ + ' ' == L' ' && '\t' == L'\t' && '\v' == L'\v' && '\r' == L'\r' && \ + '\n' == L'\n' && \ + 'a' == L'a' && 'b' == L'b' && 'c' == L'c' && 'd' == L'd' && 'e' == L'e' && \ + 'f' == L'f' && 'g' == L'g' && 'h' == L'h' && 'i' == L'i' && 'j' == L'j' && \ + 'k' == L'k' && 'l' == L'l' && 'm' == L'm' && 'n' == L'n' && 'o' == L'o' && \ + 'p' == L'p' && 'q' == L'q' && 'r' == L'r' && 's' == L's' && 't' == L't' && \ + 'u' == L'u' && 'v' == L'v' && 'w' == L'w' && 'x' == L'x' && 'y' == L'y' && \ + 'z' == L'z' && \ + 'A' == L'A' && 'B' == L'B' && 'C' == L'C' && 'D' == L'D' && 'E' == L'E' && \ + 'F' == L'F' && 'G' == L'G' && 'H' == L'H' && 'I' == L'I' && 'J' == L'J' && \ + 'K' == L'K' && 'L' == L'L' && 'M' == L'M' && 'N' == L'N' && 'O' == L'O' && \ + 'P' == L'P' && 'Q' == L'Q' && 'R' == L'R' && 'S' == L'S' && 'T' == L'T' && \ + 'U' == L'U' && 'V' == L'V' && 'W' == L'W' && 'X' == L'X' && 'Y' == L'Y' && \ + 'Z' == L'Z' && \ + '0' == L'0' && '1' == L'1' && '2' == L'2' && '3' == L'3' && '4' == L'4' && \ + '5' == L'5' && '6' == L'6' && '7' == L'7' && '8' == L'8' && \ + '9' == L'9' && \ + '_' == L'_' && '{' == L'{' && '}' == L'}' && '[' == L'[' && ']' == L']' && \ + '#' == L'#' && '(' == L'(' && ')' == L')' && '<' == L'<' && '>' == L'>' && \ + '%' == L'%' && ':' == L':' && ';' == L';' && '.' == L'.' && '?' == L'?' && \ + '*' == L'*' && '+' == L'+' && '-' == L'-' && '/' == L'/' && '^' == L'^' && \ + '&' == L'&' && '|' == L'|' && '~' == L'~' && '!' == L'!' && '=' == L'=' && \ + ',' == L',' && '\\' == L'\\' && '"' == L'"' && '\'' == L'\'' \ + ) +#if __STDC_MB_MIGHT_NEQ_WC__ +#ifndef __FreeBSD__ // PR22208, FreeBSD expects us to give a bad (but conforming) answer here. 
+_Static_assert(!DR321, "__STDC_MB_MIGHT_NEQ_WC__ but all basic source characters have same representation"); +#endif +#else +_Static_assert(DR321, "!__STDC_MB_MIGHT_NEQ_WC__ but some character differs"); +#endif + +/* WG14 DR328: yes + * String literals in compound literal initialization + */ +const char *dr328_v = (const char *){"this is a string literal"}; /* c89only-warning {{compound literals are a C99-specific feature}} */ +void dr328(void) { + const char *val = (const char *){"also a string literal"}; /* c89only-warning {{compound literals are a C99-specific feature}} */ +} diff --git a/clang/www/c_dr_status.html b/clang/www/c_dr_status.html index 4ef2937e7fe0f1..52966d705e765b 100644 --- a/clang/www/c_dr_status.html +++ b/clang/www/c_dr_status.html @@ -1855,7 +1855,7 @@
C defect report implementation status
 317 NAD Function definitions with empty parentheses
- Unknown
+ Yes
 318
@@ -1873,13 +1873,13 @@ C defect report implementation status
 320 C99 Scope of variably modified type
- Unknown
+ Yes
 321 C99 Wide character code values for members of the basic character set
- Unknown
+ Yes
 322
@@ -1921,7 +1921,7 @@ C defect report implementation status
328 C99 String literals in compound literal initialization - Unknown + Yes 329 diff --git a/flang/include/flang/Lower/CallInterface.h b/flang/include/flang/Lower/CallInterface.h index 0a8bad06779944..c7615da73039e1 100644 --- a/flang/include/flang/Lower/CallInterface.h +++ b/flang/include/flang/Lower/CallInterface.h @@ -161,6 +161,8 @@ class CallInterface { bool mayBeReadByCall() const; /// Is the argument INTENT(OUT) bool isIntentOut() const; + /// Does the argument have the CONTIGUOUS attribute or have explicit shape ? + bool mustBeMadeContiguous() const; /// How entity is passed by. PassEntityBy passBy; /// What is the entity (SymbolRef for callee/ActualArgument* for caller) diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp index b55e2ed4b804d4..ddf8fe9bd2ccd0 100644 --- a/flang/lib/Lower/CallInterface.cpp +++ b/flang/lib/Lower/CallInterface.cpp @@ -1061,6 +1061,27 @@ bool Fortran::lower::CallInterface::PassedEntity::isIntentOut() const { return true; return characteristics->GetIntent() == Fortran::common::Intent::Out; } +template +bool Fortran::lower::CallInterface::PassedEntity::mustBeMadeContiguous() + const { + if (!characteristics) + return true; + const auto *dummy = + std::get_if( + &characteristics->u); + if (!dummy) + return false; + const auto &shapeAttrs = dummy->type.attrs(); + using ShapeAttrs = Fortran::evaluate::characteristics::TypeAndShape::Attr; + if (shapeAttrs.test(ShapeAttrs::AssumedRank) || + shapeAttrs.test(ShapeAttrs::AssumedShape)) + return dummy->attrs.test( + Fortran::evaluate::characteristics::DummyDataObject::Attr::Contiguous); + if (shapeAttrs.test(ShapeAttrs::DeferredShape)) + return false; + // Explicit shape arrays are contiguous. + return dummy->type.Rank() > 0; +} template void Fortran::lower::CallInterface::determineInterface( diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index 676dfa05833f8d..cebb1a2acd4cf5 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -3071,7 +3071,11 @@ class ScalarExprLowering { /// the creation of the temp if the actual is a variable and \p byValue is /// true. It handles the cases where the actual may be absent, and all of the /// copying has to be conditional at runtime. - ExtValue prepareActualToBaseAddressLike( + /// If the actual argument may be dynamically absent, return an additional + /// boolean mlir::Value that if true means that the actual argument is + /// present. + std::pair> + prepareActualToBaseAddressLike( const Fortran::lower::SomeExpr &expr, const Fortran::lower::CallerInterface::PassedEntity &arg, CopyOutPairs ©OutPairs, bool byValue) { @@ -3092,21 +3096,23 @@ class ScalarExprLowering { (byValue || (isArray && !Fortran::evaluate::IsSimplyContiguous( expr, converter.getFoldingContext()))); const bool needsCopy = isStaticConstantByValue || variableNeedsCopy; - auto argAddr = [&]() -> ExtValue { + auto [argAddr, isPresent] = + [&]() -> std::pair> { if (!actualArgIsVariable && !needsCopy) // Actual argument is not a variable. Make sure a variable address is // not passed. 
- return genTempExtAddr(expr); + return {genTempExtAddr(expr), llvm::None}; ExtValue baseAddr; if (arg.isOptional() && Fortran::evaluate::MayBePassedAsAbsentOptional( expr, converter.getFoldingContext())) { auto [actualArgBind, isPresent] = prepareActualThatMayBeAbsent(expr); const ExtValue &actualArg = actualArgBind; if (!needsCopy) - return actualArg; + return {actualArg, isPresent}; if (isArray) - return genCopyIn(actualArg, arg, copyOutPairs, isPresent, byValue); + return {genCopyIn(actualArg, arg, copyOutPairs, isPresent, byValue), + isPresent}; // Scalars, create a temp, and use it conditionally at runtime if // the argument is present. ExtValue temp = @@ -3127,25 +3133,26 @@ class ScalarExprLowering { builder.create(loc, absent); }) .getResults()[0]; - return fir::substBase(temp, selectAddr); + return {fir::substBase(temp, selectAddr), isPresent}; } // Actual cannot be absent, the actual argument can safely be // copied-in/copied-out without any care if needed. if (isArray) { ExtValue box = genBoxArg(expr); if (needsCopy) - return genCopyIn(box, arg, copyOutPairs, - /*restrictCopyAtRuntime=*/llvm::None, byValue); + return {genCopyIn(box, arg, copyOutPairs, + /*restrictCopyAtRuntime=*/llvm::None, byValue), + llvm::None}; // Contiguous: just use the box we created above! // This gets "unboxed" below, if needed. - return box; + return {box, llvm::None}; } // Actual argument is a non-optional, non-pointer, non-allocatable // scalar. ExtValue actualArg = genExtAddr(expr); if (needsCopy) - return createInMemoryScalarCopy(builder, loc, actualArg); - return actualArg; + return {createInMemoryScalarCopy(builder, loc, actualArg), llvm::None}; + return {actualArg, llvm::None}; }(); // Scalar and contiguous expressions may be lowered to a fir.box, // either to account for potential polymorphism, or because lowering @@ -3154,7 +3161,7 @@ class ScalarExprLowering { // is passed, not one of the dynamic type), and the expr is known to // be simply contiguous, so it is safe to unbox it and pass the // address without making a copy. - return readIfBoxValue(argAddr); + return {readIfBoxValue(argAddr), isPresent}; } /// Lower a non-elemental procedure reference. @@ -3264,7 +3271,8 @@ class ScalarExprLowering { const bool byValue = arg.passBy == PassBy::BaseAddressValueAttribute || arg.passBy == PassBy::CharBoxValueAttribute; ExtValue argAddr = - prepareActualToBaseAddressLike(*expr, arg, copyOutPairs, byValue); + prepareActualToBaseAddressLike(*expr, arg, copyOutPairs, byValue) + .first; if (arg.passBy == PassBy::BaseAddress || arg.passBy == PassBy::BaseAddressValueAttribute) { caller.placeInput(arg, fir::getBase(argAddr)); @@ -3294,13 +3302,49 @@ class ScalarExprLowering { caller.placeInput(arg, boxChar); } } else if (arg.passBy == PassBy::Box) { - // Before lowering to an address, handle the allocatable/pointer actual - // argument to optional fir.box dummy. It is legal to pass - // unallocated/disassociated entity to an optional. In this case, an - // absent fir.box must be created instead of a fir.box with a null value - // (Fortran 2018 15.5.2.12 point 1). 
- if (arg.isOptional() && Fortran::evaluate::IsAllocatableOrPointerObject( - *expr, converter.getFoldingContext())) { + if (arg.mustBeMadeContiguous() && + !Fortran::evaluate::IsSimplyContiguous( + *expr, converter.getFoldingContext())) { + // If the expression is a PDT, or a polymorphic entity, or an assumed + // rank, it cannot currently be safely handled by + // prepareActualToBaseAddressLike that is intended to prepare + // arguments that can be passed as simple base address. + if (auto dynamicType = expr->GetType()) + if (dynamicType->IsPolymorphic()) + TODO(loc, "passing a polymorphic entity to an OPTIONAL " + "CONTIGUOUS argument"); + if (fir::isRecordWithTypeParameters( + fir::unwrapSequenceType(fir::unwrapPassByRefType(argTy)))) + TODO(loc, "passing to an OPTIONAL CONTIGUOUS derived type argument " + "with length parameters"); + if (Fortran::evaluate::IsAssumedRank(*expr)) + TODO(loc, "passing an assumed rank entity to an OPTIONAL " + "CONTIGUOUS argument"); + // Assumed shape VALUE are currently TODO in the call interface + // lowering. + const bool byValue = false; + auto [argAddr, isPresentValue] = + prepareActualToBaseAddressLike(*expr, arg, copyOutPairs, byValue); + mlir::Value box = builder.createBox(loc, argAddr); + if (isPresentValue) { + mlir::Value convertedBox = builder.createConvert(loc, argTy, box); + auto absent = builder.create(loc, argTy); + caller.placeInput(arg, + builder.create( + loc, *isPresentValue, convertedBox, absent)); + } else { + caller.placeInput(arg, builder.createBox(loc, argAddr)); + } + + } else if (arg.isOptional() && + Fortran::evaluate::IsAllocatableOrPointerObject( + *expr, converter.getFoldingContext())) { + // Before lowering to an address, handle the allocatable/pointer + // actual argument to optional fir.box dummy. It is legal to pass + // unallocated/disassociated entity to an optional. In this case, an + // absent fir.box must be created instead of a fir.box with a null + // value (Fortran 2018 15.5.2.12 point 1). + // // Note that passing an absent allocatable to a non-allocatable // optional dummy argument is illegal (15.5.2.12 point 3 (8)). So // nothing has to be done to generate an absent argument in this case, diff --git a/flang/test/Lower/dummy-argument-assumed-shape-optional.f90 b/flang/test/Lower/dummy-argument-assumed-shape-optional.f90 new file mode 100644 index 00000000000000..94d0fac4be87b3 --- /dev/null +++ b/flang/test/Lower/dummy-argument-assumed-shape-optional.f90 @@ -0,0 +1,377 @@ +! RUN: bbc -emit-fir %s -o - | FileCheck %s +module tests +interface + subroutine takes_contiguous(a) + real, contiguous :: a(:) + end subroutine + subroutine takes_contiguous_optional(a) + real, contiguous, optional :: a(:) + end subroutine +end interface + +contains + +! ----------------------------------------------------------------------------- +! Test passing assumed shapes to contiguous assumed shapes +! ----------------------------------------------------------------------------- +! Base case. + +subroutine test_assumed_shape_to_contiguous(x) + real :: x(:) + call takes_contiguous(x) +end subroutine +! CHECK-LABEL: func.func @_QMtestsPtest_assumed_shape_to_contiguous( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "x"}) { +! CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box +! CHECK: %[[VAL_2:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_1]]) : (!fir.box) -> i1 +! CHECK: %[[VAL_3:.*]] = fir.if %[[VAL_2]] -> (!fir.heap>) { +! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>) -> !fir.heap> +! 
CHECK: fir.result %[[VAL_4]] : !fir.heap> +! CHECK: } else { +! CHECK: %[[VAL_7:.*]] = fir.allocmem !fir.array +! CHECK: fir.do_loop {{.*}} { + ! ... copy +! CHECK: } +! CHECK: fir.result %[[VAL_7]] : !fir.heap> +! CHECK: } +! CHECK: %[[VAL_20:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_20]] : (!fir.box>, index) -> (index, index, index) +! CHECK: %[[VAL_22:.*]] = arith.constant false +! CHECK: %[[VAL_23:.*]] = arith.cmpi eq, %[[VAL_2]], %[[VAL_22]] : i1 +! CHECK: %[[VAL_24:.*]] = fir.shape %[[VAL_21]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[VAL_25:.*]] = fir.embox %[[VAL_3]](%[[VAL_24]]) : (!fir.heap>, !fir.shape<1>) -> !fir.box> +! CHECK: fir.call @_QPtakes_contiguous(%[[VAL_25]]) : (!fir.box>) -> () +! CHECK: fir.if %[[VAL_23]] { +! CHECK: fir.do_loop {{.*}} { + ! ... copy +! CHECK: } +! CHECK: fir.freemem %[[VAL_3]] : !fir.heap> +! CHECK: } +! CHECK: return +! CHECK:} + +subroutine test_assumed_shape_contiguous_to_contiguous(x) + real, contiguous :: x(:) + call takes_contiguous(x) +end subroutine +! CHECK-LABEL: func.func @_QMtestsPtest_assumed_shape_contiguous_to_contiguous( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "x", fir.contiguous}) { +! CHECK: %[[VAL_1:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>) -> !fir.ref> +! CHECK: %[[VAL_2:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +! CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +! CHECK: %[[VAL_5:.*]] = fir.shape_shift %[[VAL_4]], %[[VAL_3]]#1 : (index, index) -> !fir.shapeshift<1> +! CHECK: %[[VAL_6:.*]] = fir.embox %[[VAL_1]](%[[VAL_5]]) : (!fir.ref>, !fir.shapeshift<1>) -> !fir.box> +! CHECK: fir.call @_QPtakes_contiguous(%[[VAL_6]]) : (!fir.box>) -> () +! CHECK-NEXT: return + +subroutine test_assumed_shape_opt_to_contiguous(x) + real, optional :: x(:) + call takes_contiguous(x) +end subroutine +! CHECK-LABEL: func.func @_QMtestsPtest_assumed_shape_opt_to_contiguous( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "x", fir.optional}) { +! CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box +! CHECK: %[[VAL_2:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_1]]) : (!fir.box) -> i1 +! CHECK: %[[VAL_3:.*]] = fir.if %[[VAL_2]] -> (!fir.heap>) { +! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>) -> !fir.heap> +! CHECK: fir.result %[[VAL_4]] : !fir.heap> +! CHECK: } else { +! CHECK: %[[VAL_7:.*]] = fir.allocmem !fir.array +! CHECK: fir.do_loop {{.*}} { + ! ... copy +! CHECK: } +! CHECK: fir.result %[[VAL_7]] : !fir.heap> +! CHECK: } +! CHECK: %[[VAL_20:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_20]] : (!fir.box>, index) -> (index, index, index) +! CHECK: %[[VAL_22:.*]] = arith.constant false +! CHECK: %[[VAL_23:.*]] = arith.cmpi eq, %[[VAL_2]], %[[VAL_22]] : i1 +! CHECK: %[[VAL_24:.*]] = fir.shape %[[VAL_21]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[VAL_25:.*]] = fir.embox %[[VAL_3]](%[[VAL_24]]) : (!fir.heap>, !fir.shape<1>) -> !fir.box> +! CHECK: fir.call @_QPtakes_contiguous(%[[VAL_25]]) : (!fir.box>) -> () +! CHECK: fir.if %[[VAL_23]] { +! CHECK: fir.do_loop {{.*}} { + ! ... copy +! CHECK: } +! CHECK: fir.freemem %[[VAL_3]] : !fir.heap> +! CHECK: } +! CHECK: return +! CHECK:} + +subroutine test_assumed_shape_contiguous_opt_to_contiguous(x) + real, optional, contiguous :: x(:) + call takes_contiguous(x) +end subroutine +! 
CHECK-LABEL: func.func @_QMtestsPtest_assumed_shape_contiguous_opt_to_contiguous( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "x", fir.contiguous, fir.optional}) { +! CHECK: fir.call @_QPtakes_contiguous(%[[VAL_0]]) : (!fir.box>) -> () +! CHECK-NEXT: return + + +! ----------------------------------------------------------------------------- +! Test passing assumed shapes to contiguous optional assumed shapes +! ----------------------------------------------------------------------------- +! The copy-in/out must take into account the actual argument presence (which may +! not be known until runtime). + +subroutine test_assumed_shape_to_contiguous_opt(x) + real :: x(:) + call takes_contiguous_optional(x) +end subroutine +! CHECK-LABEL: func.func @_QMtestsPtest_assumed_shape_to_contiguous_opt( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "x"}) { +! CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box +! CHECK: %[[VAL_2:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_1]]) : (!fir.box) -> i1 +! CHECK: %[[VAL_3:.*]] = fir.if %[[VAL_2]] -> (!fir.heap>) { +! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>) -> !fir.heap> +! CHECK: fir.result %[[VAL_4]] : !fir.heap> +! CHECK: } else { +! CHECK: %[[VAL_7:.*]] = fir.allocmem !fir.array +! CHECK: fir.do_loop {{.*}} { + ! ... copy +! CHECK: } +! CHECK: fir.result %[[VAL_7]] : !fir.heap> +! CHECK: } +! CHECK: %[[VAL_20:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_20]] : (!fir.box>, index) -> (index, index, index) +! CHECK: %[[VAL_22:.*]] = arith.constant false +! CHECK: %[[VAL_23:.*]] = arith.cmpi eq, %[[VAL_2]], %[[VAL_22]] : i1 +! CHECK: %[[VAL_24:.*]] = fir.shape %[[VAL_21]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[VAL_25:.*]] = fir.embox %[[VAL_3]](%[[VAL_24]]) : (!fir.heap>, !fir.shape<1>) -> !fir.box> +! CHECK: fir.call @_QPtakes_contiguous_optional(%[[VAL_25]]) : (!fir.box>) -> () +! CHECK: fir.if %[[VAL_23]] { +! CHECK: fir.do_loop {{.*}} { + ! ... copy +! CHECK: } +! CHECK: fir.freemem %[[VAL_3]] : !fir.heap> +! CHECK: } +! CHECK: return +! CHECK:} + +subroutine test_assumed_shape_contiguous_to_contiguous_opt(x) + real, contiguous :: x(:) + call takes_contiguous_optional(x) +end subroutine +! CHECK-LABEL: func.func @_QMtestsPtest_assumed_shape_contiguous_to_contiguous_opt( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "x", fir.contiguous}) { +! CHECK: %[[VAL_1:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>) -> !fir.ref> +! CHECK: %[[VAL_2:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +! CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +! CHECK: %[[VAL_5:.*]] = fir.shape_shift %[[VAL_4]], %[[VAL_3]]#1 : (index, index) -> !fir.shapeshift<1> +! CHECK: %[[VAL_6:.*]] = fir.embox %[[VAL_1]](%[[VAL_5]]) : (!fir.ref>, !fir.shapeshift<1>) -> !fir.box> +! CHECK: fir.call @_QPtakes_contiguous_optional(%[[VAL_6]]) : (!fir.box>) -> () +! CHECK-NEXT: return + +subroutine test_assumed_shape_opt_to_contiguous_opt(x) + real, optional :: x(:) + call takes_contiguous_optional(x) +end subroutine +! CHECK-LABEL: func.func @_QMtestsPtest_assumed_shape_opt_to_contiguous_opt( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "x", fir.optional}) { +! CHECK: %[[VAL_1:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>) -> i1 +! CHECK: %[[VAL_2:.*]] = fir.zero_bits !fir.ref> +! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +! 
CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_5:.*]] = fir.embox %[[VAL_2]](%[[VAL_4]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +! CHECK: %[[VAL_6:.*]] = arith.select %[[VAL_1]], %[[VAL_0]], %[[VAL_5]] : !fir.box> +! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (!fir.box>) -> !fir.box +! CHECK: %[[VAL_8:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_7]]) : (!fir.box) -> i1 +! CHECK: %[[VAL_9:.*]] = fir.if %[[VAL_1]] -> (!fir.heap>) { +! CHECK: %[[VAL_10:.*]] = fir.if %[[VAL_8]] -> (!fir.heap>) { +! CHECK: %[[VAL_11:.*]] = fir.box_addr %[[VAL_6]] : (!fir.box>) -> !fir.heap> +! CHECK: fir.result %[[VAL_11]] : !fir.heap> +! CHECK: } else { +! CHECK: %[[VAL_14:.*]] = fir.allocmem !fir.array +! CHECK: fir.do_loop {{.*}} { + ! copy ... +! CHECK: } +! CHECK: fir.result %[[VAL_14]] : !fir.heap> +! CHECK: } +! CHECK: fir.result %[[VAL_10]] : !fir.heap> +! CHECK: } else { +! CHECK: %[[VAL_28:.*]] = fir.zero_bits !fir.heap> +! CHECK: fir.result %[[VAL_28]] : !fir.heap> +! CHECK: } +! CHECK: %[[VAL_29:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_30:.*]]:3 = fir.box_dims %[[VAL_6]], %[[VAL_29]] : (!fir.box>, index) -> (index, index, index) +! CHECK: %[[VAL_31:.*]] = arith.constant false +! CHECK: %[[VAL_32:.*]] = arith.cmpi eq, %[[VAL_8]], %[[VAL_31]] : i1 +! CHECK: %[[VAL_33:.*]] = arith.andi %[[VAL_1]], %[[VAL_32]] : i1 +! CHECK: %[[VAL_34:.*]] = fir.shape %[[VAL_30]]#1 : (index) -> !fir.shape<1> +! CHECK: %[[VAL_35:.*]] = fir.embox %[[VAL_9]](%[[VAL_34]]) : (!fir.heap>, !fir.shape<1>) -> !fir.box> +! CHECK: %[[VAL_37:.*]] = fir.absent !fir.box> +! CHECK: %[[VAL_38:.*]] = arith.select %[[VAL_1]], %[[VAL_35]], %[[VAL_37]] : !fir.box> +! CHECK: fir.call @_QPtakes_contiguous_optional(%[[VAL_38]]) : (!fir.box>) -> () +! CHECK: fir.if %[[VAL_33]] { +! CHECK: %[[VAL_47:.*]] = fir.do_loop {{.*}} { + ! copy ... +! CHECK: } +! CHECK: fir.freemem %[[VAL_9]] : !fir.heap> +! CHECK: } +! CHECK: return +! CHECK:} + +subroutine test_assumed_shape_contiguous_opt_to_contiguous_opt(x) + real, contiguous, optional :: x(:) + call takes_contiguous_optional(x) +end subroutine +! CHECK-LABEL: func.func @_QMtestsPtest_assumed_shape_contiguous_opt_to_contiguous_opt( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "x", fir.contiguous, fir.optional}) { +! CHECK: fir.call @_QPtakes_contiguous_optional(%[[VAL_0]]) : (!fir.box>) -> () +! CHECK-NEXT: return + +! ----------------------------------------------------------------------------- +! Test passing pointers to contiguous optional assumed shapes +! ----------------------------------------------------------------------------- +! This case is interesting because pointers may be non contiguous, and also because +! a pointer passed to an optional assumed shape dummy is present if and only if the +! pointer is associated (regardless of the pointer optionality). + +subroutine test_pointer_to_contiguous_opt(x) + real, pointer :: x(:) + call takes_contiguous_optional(x) +end subroutine +! CHECK-LABEL: func.func @_QMtestsPtest_pointer_to_contiguous_opt( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.bindc_name = "x"}) { +! CHECK: %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> +! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box>>) -> !fir.ptr> +! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ptr>) -> i64 +! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_5:.*]] = arith.cmpi ne, %[[VAL_3]], %[[VAL_4]] : i64 +! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> +! 
CHECK: %[[VAL_7:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_8:.*]]:3 = fir.box_dims %[[VAL_6]], %[[VAL_7]] : (!fir.box>>, index) -> (index, index, index) +! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_6]] : (!fir.box>>) -> !fir.box +! CHECK: %[[VAL_10:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_9]]) : (!fir.box) -> i1 +! CHECK: %[[VAL_11:.*]] = fir.if %[[VAL_5]] -> (!fir.heap>) { +! CHECK: %[[VAL_12:.*]] = fir.if %[[VAL_10]] -> (!fir.heap>) { +! CHECK: %[[VAL_13:.*]] = fir.box_addr %[[VAL_6]] : (!fir.box>>) -> !fir.heap> +! CHECK: fir.result %[[VAL_13]] : !fir.heap> +! CHECK: } else { +! CHECK: %[[VAL_16:.*]] = fir.allocmem !fir.array +! CHECK: fir.do_loop {{.*}} { + ! copy +! CHECK: } +! CHECK: fir.result %[[VAL_16]] : !fir.heap> +! CHECK: } +! CHECK: fir.result %[[VAL_12]] : !fir.heap> +! CHECK: } else { +! CHECK: %[[VAL_31:.*]] = fir.zero_bits !fir.heap> +! CHECK: fir.result %[[VAL_31]] : !fir.heap> +! CHECK: } +! CHECK: %[[VAL_32:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_33:.*]]:3 = fir.box_dims %[[VAL_6]], %[[VAL_32]] : (!fir.box>>, index) -> (index, index, index) +! CHECK: %[[VAL_34:.*]] = arith.constant false +! CHECK: %[[VAL_35:.*]] = arith.cmpi eq, %[[VAL_10]], %[[VAL_34]] : i1 +! CHECK: %[[VAL_36:.*]] = arith.andi %[[VAL_5]], %[[VAL_35]] : i1 +! CHECK: %[[VAL_37:.*]] = fir.shape_shift %[[VAL_8]]#0, %[[VAL_33]]#1 : (index, index) -> !fir.shapeshift<1> +! CHECK: %[[VAL_38:.*]] = fir.embox %[[VAL_11]](%[[VAL_37]]) : (!fir.heap>, !fir.shapeshift<1>) -> !fir.box> +! CHECK: %[[VAL_40:.*]] = fir.absent !fir.box> +! CHECK: %[[VAL_41:.*]] = arith.select %[[VAL_5]], %[[VAL_38]], %[[VAL_40]] : !fir.box> +! CHECK: fir.call @_QPtakes_contiguous_optional(%[[VAL_41]]) : (!fir.box>) -> () +! CHECK: fir.if %[[VAL_36]] { +! CHECK: fir.do_loop {{.*}} { + ! copy +! CHECK: } +! CHECK: fir.freemem %[[VAL_11]] : !fir.heap> +! CHECK: } +! CHECK: return +! CHECK:} + +subroutine test_pointer_contiguous_to_contiguous_opt(x) + real, pointer, contiguous :: x(:) + call takes_contiguous_optional(x) +end subroutine +! CHECK-LABEL: func.func @_QMtestsPtest_pointer_contiguous_to_contiguous_opt( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.bindc_name = "x", fir.contiguous}) { +! CHECK: %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> +! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box>>) -> !fir.ptr> +! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ptr>) -> i64 +! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_5:.*]] = arith.cmpi ne, %[[VAL_3]], %[[VAL_4]] : i64 +! CHECK: %[[VAL_6:.*]] = fir.absent !fir.box> +! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> +! CHECK: %[[VAL_8:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_9:.*]]:3 = fir.box_dims %[[VAL_7]], %[[VAL_8]] : (!fir.box>>, index) -> (index, index, index) +! CHECK: %[[VAL_10:.*]] = fir.box_addr %[[VAL_7]] : (!fir.box>>) -> !fir.ptr> +! CHECK: %[[VAL_11:.*]] = fir.shape_shift %[[VAL_9]]#0, %[[VAL_9]]#1 : (index, index) -> !fir.shapeshift<1> +! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_10]](%[[VAL_11]]) : (!fir.ptr>, !fir.shapeshift<1>) -> !fir.box> +! CHECK: %[[VAL_13:.*]] = arith.select %[[VAL_5]], %[[VAL_12]], %[[VAL_6]] : !fir.box> +! CHECK: fir.call @_QPtakes_contiguous_optional(%[[VAL_13]]) : (!fir.box>) -> () +! CHECK-NEXT: return + +subroutine test_pointer_opt_to_contiguous_opt(x) + real, pointer, optional :: x(:) + call takes_contiguous_optional(x) +end subroutine +! CHECK-LABEL: func.func @_QMtestsPtest_pointer_opt_to_contiguous_opt( +! 
CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.bindc_name = "x", fir.optional}) { +! CHECK: %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> +! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box>>) -> !fir.ptr> +! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ptr>) -> i64 +! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_5:.*]] = arith.cmpi ne, %[[VAL_3]], %[[VAL_4]] : i64 +! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> +! CHECK: %[[VAL_7:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_8:.*]]:3 = fir.box_dims %[[VAL_6]], %[[VAL_7]] : (!fir.box>>, index) -> (index, index, index) +! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_6]] : (!fir.box>>) -> !fir.box +! CHECK: %[[VAL_10:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_9]]) : (!fir.box) -> i1 +! CHECK: %[[VAL_11:.*]] = fir.if %[[VAL_5]] -> (!fir.heap>) { +! CHECK: %[[VAL_12:.*]] = fir.if %[[VAL_10]] -> (!fir.heap>) { +! CHECK: %[[VAL_13:.*]] = fir.box_addr %[[VAL_6]] : (!fir.box>>) -> !fir.heap> +! CHECK: fir.result %[[VAL_13]] : !fir.heap> +! CHECK: } else { +! CHECK: %[[VAL_16:.*]] = fir.allocmem !fir.array +! CHECK: fir.do_loop {{.*}} { + ! copy +! CHECK: } +! CHECK: fir.result %[[VAL_16]] : !fir.heap> +! CHECK: } +! CHECK: fir.result %[[VAL_12]] : !fir.heap> +! CHECK: } else { +! CHECK: %[[VAL_31:.*]] = fir.zero_bits !fir.heap> +! CHECK: fir.result %[[VAL_31]] : !fir.heap> +! CHECK: } +! CHECK: %[[VAL_32:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_33:.*]]:3 = fir.box_dims %[[VAL_6]], %[[VAL_32]] : (!fir.box>>, index) -> (index, index, index) +! CHECK: %[[VAL_34:.*]] = arith.constant false +! CHECK: %[[VAL_35:.*]] = arith.cmpi eq, %[[VAL_10]], %[[VAL_34]] : i1 +! CHECK: %[[VAL_36:.*]] = arith.andi %[[VAL_5]], %[[VAL_35]] : i1 +! CHECK: %[[VAL_37:.*]] = fir.shape_shift %[[VAL_8]]#0, %[[VAL_33]]#1 : (index, index) -> !fir.shapeshift<1> +! CHECK: %[[VAL_38:.*]] = fir.embox %[[VAL_11]](%[[VAL_37]]) : (!fir.heap>, !fir.shapeshift<1>) -> !fir.box> +! CHECK: %[[VAL_40:.*]] = fir.absent !fir.box> +! CHECK: %[[VAL_41:.*]] = arith.select %[[VAL_5]], %[[VAL_38]], %[[VAL_40]] : !fir.box> +! CHECK: fir.call @_QPtakes_contiguous_optional(%[[VAL_41]]) : (!fir.box>) -> () +! CHECK: fir.if %[[VAL_36]] { +! CHECK: fir.do_loop {{.*}} { + ! copy +! CHECK: } +! CHECK: fir.freemem %[[VAL_11]] : !fir.heap> +! CHECK: } +! CHECK: return +! CHECK:} + +subroutine test_pointer_contiguous_opt_to_contiguous_opt(x) + real, pointer, contiguous, optional :: x(:) + call takes_contiguous_optional(x) +end subroutine +! CHECK-LABEL: func.func @_QMtestsPtest_pointer_contiguous_opt_to_contiguous_opt( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.bindc_name = "x", fir.contiguous, fir.optional}) { +! CHECK: %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> +! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box>>) -> !fir.ptr> +! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ptr>) -> i64 +! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_5:.*]] = arith.cmpi ne, %[[VAL_3]], %[[VAL_4]] : i64 +! CHECK: %[[VAL_6:.*]] = fir.absent !fir.box> +! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> +! CHECK: %[[VAL_8:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_9:.*]]:3 = fir.box_dims %[[VAL_7]], %[[VAL_8]] : (!fir.box>>, index) -> (index, index, index) +! CHECK: %[[VAL_10:.*]] = fir.box_addr %[[VAL_7]] : (!fir.box>>) -> !fir.ptr> +! CHECK: %[[VAL_11:.*]] = fir.shape_shift %[[VAL_9]]#0, %[[VAL_9]]#1 : (index, index) -> !fir.shapeshift<1> +! 
CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_10]](%[[VAL_11]]) : (!fir.ptr>, !fir.shapeshift<1>) -> !fir.box> +! CHECK: %[[VAL_13:.*]] = arith.select %[[VAL_5]], %[[VAL_12]], %[[VAL_6]] : !fir.box> +! CHECK-NEXT: fir.call @_QPtakes_contiguous_optional(%[[VAL_13]]) : (!fir.box>) -> () +! CHECK: return +end module diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt index 4740c761d1114b..1b11d888bdc4f8 100644 --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -7,7 +7,7 @@ add_header_library( DEPENDS libc.src.__support.CPP.bitset .memory_utils.memcpy_implementation - .memory_utils.memset_implementation + .memory_utils.bzero_implementation ) add_entrypoint_object( @@ -65,7 +65,7 @@ add_entrypoint_object( HDRS stpncpy.h DEPENDS - .memory_utils.memset_implementation + .memory_utils.bzero_implementation ) add_entrypoint_object( diff --git a/libc/src/string/bzero.cpp b/libc/src/string/bzero.cpp index c57c922f6eff6f..b04cca834f9867 100644 --- a/libc/src/string/bzero.cpp +++ b/libc/src/string/bzero.cpp @@ -8,12 +8,12 @@ #include "src/string/bzero.h" #include "src/__support/common.h" -#include "src/string/memory_utils/memset_implementations.h" +#include "src/string/memory_utils/bzero_implementations.h" namespace __llvm_libc { LLVM_LIBC_FUNCTION(void, bzero, (void *ptr, size_t count)) { - inline_memset(reinterpret_cast(ptr), 0, count); + inline_bzero(reinterpret_cast(ptr), count); } } // namespace __llvm_libc diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt index 6cd45ddc42ace6..d735fcfe54174a 100644 --- a/libc/src/string/memory_utils/CMakeLists.txt +++ b/libc/src/string/memory_utils/CMakeLists.txt @@ -5,6 +5,7 @@ add_header_library( utils.h elements.h bcmp_implementations.h + bzero_implementations.h memcmp_implementations.h memcpy_implementations.h memset_implementations.h @@ -35,3 +36,11 @@ add_header_library( DEPS .memory_utils ) + +add_header_library( + bzero_implementation + HDRS + bzero_implementations.h + DEPS + .memset_implementation +) diff --git a/libc/src/string/memory_utils/address.h b/libc/src/string/memory_utils/address.h deleted file mode 100644 index caa71be5b1da94..00000000000000 --- a/libc/src/string/memory_utils/address.h +++ /dev/null @@ -1,133 +0,0 @@ -//===-- Strongly typed address with alignment and access semantics --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_COMMON_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_COMMON_H - -#include "src/__support/CPP/type_traits.h" // cpp::ConditionalType -#include "src/string/memory_utils/utils.h" // is_power2 -#include // size_t -#include // uint8_t, uint16_t, uint32_t, uint64_t - -namespace __llvm_libc { - -// Utility to enable static_assert(false) in templates. -template static void DeferredStaticAssert(const char *msg) { - static_assert(flag, "compilation error"); -} - -// A non-coercible type to represent raw data. -enum class ubyte : unsigned char { ZERO = 0 }; - -// Address attribute specifying whether the underlying load / store operations -// are temporal or non-temporal. 
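The header being removed here ("Strongly typed address with alignment and access semantics") encodes alignment, read/write permission, and temporality in the pointer type, so that compile-time offsets can propagate provable alignment to the code generator. A rough standalone sketch of that core idea only (an editor's illustration with hypothetical names, not code from this patch):

#include <cstddef>

// Carry the provable alignment of a pointer in its type.
template <size_t Alignment> struct AlignedAddr {
  char *ptr;

  // The optimizer may select aligned loads/stores through this pointer.
  char *get() const {
    return static_cast<char *>(__builtin_assume_aligned(ptr, Alignment));
  }

  // Advancing by a multiple of the alignment preserves the guarantee at
  // compile time, mirroring the offset helpers defined below.
  AlignedAddr advance_by_multiple(ptrdiff_t bytes) const {
    return {ptr + bytes};
  }
};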
-enum class Temporality { TEMPORAL, NON_TEMPORAL }; - -// Address attribute specifying whether the underlying load / store operations -// are aligned or unaligned. -enum class Aligned { NO, YES }; - -// Address attribute to discriminate between readable and writable addresses. -enum class Permission { Read, Write }; - -// Address is semantically equivalent to a pointer but also conveys compile time -// information that helps with instructions selection (aligned/unaligned, -// temporal/non-temporal). -template struct Address { - static_assert(is_power2(Alignment)); - static constexpr size_t ALIGNMENT = Alignment; - static constexpr Permission PERMISSION = P; - static constexpr Temporality TEMPORALITY = TS; - static constexpr bool IS_READ = P == Permission::Read; - static constexpr bool IS_WRITE = P == Permission::Write; - using PointeeType = cpp::conditional_t; - using VoidType = cpp::conditional_t; - - Address(VoidType *ptr) : ptr_(reinterpret_cast(ptr)) {} - - PointeeType *ptr() const { - return reinterpret_cast( - __builtin_assume_aligned(ptr_, ALIGNMENT)); - } - - PointeeType *const ptr_; - - template auto offset(size_t byte_offset) const { - static constexpr size_t NewAlignment = commonAlign(); - return Address(ptr_ + byte_offset); - } - -private: - static constexpr size_t gcd(size_t A, size_t B) { - return B == 0 ? A : gcd(B, A % B); - } - - template static constexpr size_t commonAlign() { - constexpr size_t GCD = gcd(ByteOffset, ALIGNMENT); - if constexpr (is_power2(GCD)) - return GCD; - else - return 1; - } -}; - -template struct IsAddressType : public cpp::false_type {}; -template -struct IsAddressType> : public cpp::true_type {}; - -// Reinterpret the address as a pointer to T. -// This is not UB since the underlying pointer always refers to a `char` in a -// buffer of raw data. -template static T *as(AddrT addr) { - static_assert(IsAddressType::value); - return reinterpret_cast(addr.ptr()); -} - -// Offsets the address by a compile time amount, this allows propagating -// alignment whenever possible. -template -static auto offsetAddr(AddrT addr) { - static_assert(IsAddressType::value); - return addr.template offset(ByteOffset); -} - -// Offsets the address by a runtime amount but assuming that the resulting -// address will be Alignment aligned. -template -static auto offsetAddrAssumeAligned(AddrT addr, size_t byte_offset) { - static_assert(IsAddressType::value); - return Address(addr.ptr_ + - byte_offset); -} - -// Offsets the address by a runtime amount that is assumed to be a multiple of -// ByteOffset. This allows to propagate the address alignment whenever possible. -template -static auto offsetAddrMultiplesOf(AddrT addr, ptrdiff_t byte_offset) { - static_assert(IsAddressType::value); - return addr.template offset(byte_offset); -} - -// User friendly aliases for common address types. -template -using SrcAddr = Address; -template -using DstAddr = Address; -template -using NtSrcAddr = - Address; -template -using NtDstAddr = - Address; - -} // namespace __llvm_libc - -#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_COMMON_H diff --git a/libc/src/string/memory_utils/algorithm.h b/libc/src/string/memory_utils/algorithm.h deleted file mode 100644 index 6355ffe04562f7..00000000000000 --- a/libc/src/string/memory_utils/algorithm.h +++ /dev/null @@ -1,463 +0,0 @@ -//===-- Algorithms to compose sized memory operations ---------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Higher order primitives that build upon the SizedOpT facility. -// They constitute the basic blocks for composing memory functions. -// This file defines the following operations: -// - Skip -// - Tail -// - HeadTail -// - Loop -// - Align -// -// See each class for documentation. -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ALGORITHM_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ALGORITHM_H - -#include "src/string/memory_utils/address.h" // Address -#include "src/string/memory_utils/utils.h" // offset_to_next_aligned - -#include // ptrdiff_t - -namespace __llvm_libc { - -// We are not yet allowed to use asserts in low level memory operations as -// assert itself could depend on them. -// We define this empty macro so we can enable them as soon as possible and keep -// track of invariants. -#define LIBC_ASSERT(COND) - -// An operation that allows to skip the specified amount of bytes. -template struct Skip { - template struct Then { - template - static inline void set(DstAddrT dst, ubyte value) { - static_assert(NextT::IS_FIXED_SIZE); - NextT::set(offsetAddr(dst), value); - } - - template - static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2) { - static_assert(NextT::IS_FIXED_SIZE); - return NextT::isDifferent(offsetAddr(src1), - offsetAddr(src2)); - } - - template - static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2) { - static_assert(NextT::IS_FIXED_SIZE); - return NextT::threeWayCmp(offsetAddr(src1), - offsetAddr(src2)); - } - - template - static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2, - size_t runtime_size) { - static_assert(NextT::IS_RUNTIME_SIZE); - return NextT::threeWayCmp(offsetAddr(src1), - offsetAddr(src2), runtime_size - Bytes); - } - }; -}; - -// Compute the address of a tail operation. -// Because of the runtime size, we loose the alignment information. -template -static auto tailAddr(AddrT addr, size_t runtime_size) { - static_assert(IsAddressType::value); - return offsetAddrAssumeAligned<1>(addr, runtime_size - Size); -} - -// Perform the operation on the last 'Size' bytes of the buffer. -// -// e.g. with -// [1234567812345678123] -// [__XXXXXXXXXXXXXX___] -// [________XXXXXXXX___] -// -// Precondition: `runtime_size >= Size`. 
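Concretely, the Tail strategy documented above re-anchors one fixed-size operation at the end of the buffer, so a fixed-size head operation (or a loop) plus one tail operation covers any size without a scalar remainder loop. A minimal sketch of the same idea outside the template machinery (editor's illustration assuming an 8-byte block; not code from the deleted header):

#include <cstddef>
#include <cstring>

// Set the last 8 bytes of dst. Paired with a fixed 8-byte store at the
// front (the HeadTail pattern below), this handles any 8 <= n <= 16,
// because the two stores are allowed to overlap in the middle.
void set_tail8(unsigned char *dst, unsigned char value, size_t n) {
  // Precondition, as stated above: n >= 8.
  unsigned char block[8];
  std::memset(block, value, sizeof block);
  std::memcpy(dst + n - 8, block, sizeof block);
}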
-template struct Tail { - static_assert(SizedOpT::IS_FIXED_SIZE); - static constexpr bool IS_RUNTIME_SIZE = true; - static constexpr size_t SIZE = SizedOpT::SIZE; - - template - static inline void copy(DstAddrT dst, SrcAddrT src, size_t runtime_size) { - SizedOpT::copy(tailAddr(dst, runtime_size), - tailAddr(src, runtime_size)); - } - - template - static inline void move(DstAddrT dst, SrcAddrT src, size_t runtime_size) { - SizedOpT::move(tailAddr(dst, runtime_size), - tailAddr(src, runtime_size)); - } - - template - static inline void set(DstAddrT dst, ubyte value, size_t runtime_size) { - SizedOpT::set(tailAddr(dst, runtime_size), value); - } - - template - static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2, - size_t runtime_size) { - return SizedOpT::isDifferent(tailAddr(src1, runtime_size), - tailAddr(src2, runtime_size)); - } - - template - static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2, - size_t runtime_size) { - return SizedOpT::threeWayCmp(tailAddr(src1, runtime_size), - tailAddr(src2, runtime_size)); - } -}; - -// Perform the operation on the first and the last `SizedOpT::Size` bytes of the -// buffer. This is useful for overlapping operations. -// -// e.g. with -// [1234567812345678123] -// [__XXXXXXXXXXXXXX___] -// [__XXXXXXXX_________] -// [________XXXXXXXX___] -// -// Precondition: `runtime_size >= Size && runtime_size <= 2 x Size`. -template struct HeadTail { - static_assert(SizedOpT::IS_FIXED_SIZE); - static constexpr bool IS_RUNTIME_SIZE = true; - - template - static inline void copy(DstAddrT dst, SrcAddrT src, size_t runtime_size) { - LIBC_ASSERT(runtime_size >= SizedOpT::SIZE); - SizedOpT::copy(dst, src); - Tail::copy(dst, src, runtime_size); - } - - template - static inline void move(DstAddrT dst, SrcAddrT src, size_t runtime_size) { - LIBC_ASSERT(runtime_size >= SizedOpT::SIZE); - static constexpr size_t BLOCK_SIZE = SizedOpT::SIZE; - // The load and store operations can be performed in any order as long as - // they are not interleaved. More investigations are needed to determine the - // best order. - auto head = SizedOpT::load(src); - auto tail = SizedOpT::load(tailAddr(src, runtime_size)); - SizedOpT::store(tailAddr(dst, runtime_size), tail); - SizedOpT::store(dst, head); - } - - template - static inline void set(DstAddrT dst, ubyte value, size_t runtime_size) { - LIBC_ASSERT(runtime_size >= SizedOpT::SIZE); - SizedOpT::set(dst, value); - Tail::set(dst, value, runtime_size); - } - - template - static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2, - size_t runtime_size) { - LIBC_ASSERT(runtime_size >= SizedOpT::SIZE); - // Two strategies can be applied here: - // 1. Compute head and tail and compose them with a bitwise or operation. - // 2. Stop early if head is different. - // We chose the later because HeadTail operations are typically performed - // with sizes ranging from 4 to 256 bytes. The cost of the loads is then - // significantly larger than the cost of the branch. - if (const uint64_t res = SizedOpT::isDifferent(src1, src2)) - return res; - return Tail::isDifferent(src1, src2, runtime_size); - } - - template - static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2, - size_t runtime_size) { - LIBC_ASSERT(runtime_size >= SizedOpT::SIZE && - runtime_size <= 2 * SizedOpT::SIZE); - if (const int32_t res = SizedOpT::threeWayCmp(src1, src2)) - return res; - return Tail::threeWayCmp(src1, src2, runtime_size); - } -}; - -// Simple loop ending with a Tail operation. -// -// e.g. 
with -// [12345678123456781234567812345678] -// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___] -// [__XXXXXXXX_______________________] -// [__________XXXXXXXX_______________] -// [__________________XXXXXXXX_______] -// [______________________XXXXXXXX___] -// -// Precondition: -// - runtime_size >= Size -template struct Loop { - static_assert(SizedOpT::IS_FIXED_SIZE); - static constexpr bool IS_RUNTIME_SIZE = true; - static constexpr size_t BLOCK_SIZE = SizedOpT::SIZE; - - template - static inline void copy(DstAddrT dst, SrcAddrT src, size_t runtime_size) { - size_t offset = 0; - do { - SizedOpT::copy(offsetAddrMultiplesOf(dst, offset), - offsetAddrMultiplesOf(src, offset)); - offset += BLOCK_SIZE; - } while (offset < runtime_size - BLOCK_SIZE); - Tail::copy(dst, src, runtime_size); - } - - // Move forward suitable when dst < src. We load the tail bytes before - // handling the loop. - // - // e.g. Moving two bytes - // [ | | | | |] - // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] - // [_________________________LLLLLLLL___] - // [___LLLLLLLL_________________________] - // [_SSSSSSSS___________________________] - // [___________LLLLLLLL_________________] - // [_________SSSSSSSS___________________] - // [___________________LLLLLLLL_________] - // [_________________SSSSSSSS___________] - // [_______________________SSSSSSSS_____] - template - static inline void move(DstAddrT dst, SrcAddrT src, size_t runtime_size) { - const auto tail_value = - SizedOpT::load(tailAddr(src, runtime_size)); - size_t offset = 0; - do { - SizedOpT::move(offsetAddrMultiplesOf(dst, offset), - offsetAddrMultiplesOf(src, offset)); - offset += BLOCK_SIZE; - } while (offset < runtime_size - BLOCK_SIZE); - SizedOpT::store(tailAddr(dst, runtime_size), tail_value); - } - - // Move backward suitable when dst > src. We load the head bytes before - // handling the loop. - // - // e.g. 
Moving two bytes - // [ | | | | |] - // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] - // [___LLLLLLLL_________________________] - // [_________________________LLLLLLLL___] - // [___________________________SSSSSSSS_] - // [_________________LLLLLLLL___________] - // [___________________SSSSSSSS_________] - // [_________LLLLLLLL___________________] - // [___________SSSSSSSS_________________] - // [_____SSSSSSSS_______________________] - template - static inline void move_backward(DstAddrT dst, SrcAddrT src, - size_t runtime_size) { - const auto head_value = SizedOpT::load(src); - ptrdiff_t offset = runtime_size - BLOCK_SIZE; - do { - SizedOpT::move(offsetAddrMultiplesOf(dst, offset), - offsetAddrMultiplesOf(src, offset)); - offset -= BLOCK_SIZE; - } while (offset >= 0); - SizedOpT::store(dst, head_value); - } - - template - static inline void set(DstAddrT dst, ubyte value, size_t runtime_size) { - size_t offset = 0; - do { - SizedOpT::set(offsetAddrMultiplesOf(dst, offset), value); - offset += BLOCK_SIZE; - } while (offset < runtime_size - BLOCK_SIZE); - Tail::set(dst, value, runtime_size); - } - - template - static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2, - size_t runtime_size) { - size_t offset = 0; - do { - if (uint64_t res = SizedOpT::isDifferent( - offsetAddrMultiplesOf(src1, offset), - offsetAddrMultiplesOf(src2, offset))) - return res; - offset += BLOCK_SIZE; - } while (offset < runtime_size - BLOCK_SIZE); - return Tail::isDifferent(src1, src2, runtime_size); - } - - template - static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2, - size_t runtime_size) { - size_t offset = 0; - do { - if (int32_t res = SizedOpT::threeWayCmp( - offsetAddrMultiplesOf(src1, offset), - offsetAddrMultiplesOf(src2, offset))) - return res; - offset += BLOCK_SIZE; - } while (offset < runtime_size - BLOCK_SIZE); - return Tail::threeWayCmp(src1, src2, runtime_size); - } -}; - -// Aligns using a statically-sized operation, then calls the subsequent NextT -// operation. -// -// e.g. A 16-byte Destination Aligned 32-byte Loop Copy can be written as: -// Align<_16, Arg::Dst>::Then>::copy(dst, src, runtime_size); -enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 }; -template struct Align { - static_assert(SizedOpT::IS_FIXED_SIZE); - - template struct Then { - static_assert(NextT::IS_RUNTIME_SIZE); - - template - static inline void copy(DstAddrT dst, SrcAddrT src, size_t runtime_size) { - SizedOpT::copy(dst, src); - auto aligned = align(dst, src, runtime_size); - NextT::copy(aligned.arg1, aligned.arg2, aligned.size); - } - - // Move forward suitable when dst < src. The alignment is performed with - // an HeadTail operation of size ∈ [Alignment, 2 x Alignment]. - // - // e.g. Moving two bytes and making sure src is then aligned. - // [ | | | | ] - // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] - // [____LLLLLLLL_____________________] - // [___________LLLLLLLL______________] - // [_SSSSSSSS________________________] - // [________SSSSSSSS_________________] - // - // e.g. Moving two bytes and making sure dst is then aligned. - // [ | | | | ] - // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] - // [____LLLLLLLL_____________________] - // [______LLLLLLLL___________________] - // [_SSSSSSSS________________________] - // [___SSSSSSSS______________________] - template - static inline void move(DstAddrT dst, SrcAddrT src, size_t runtime_size) { - auto aligned_after_begin = align(dst, src, runtime_size); - // We move pointers forward by Size so we can perform HeadTail. 
- auto aligned = aligned_after_begin.stepForward(); - HeadTail::move(dst, src, runtime_size - aligned.size); - NextT::move(aligned.arg1, aligned.arg2, aligned.size); - } - - // Move backward suitable when dst > src. The alignment is performed with - // an HeadTail operation of size ∈ [Alignment, 2 x Alignment]. - // - // e.g. Moving two bytes backward and making sure src is then aligned. - // [ | | | | ] - // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] - // [ _________________LLLLLLLL_______] - // [ ___________________LLLLLLLL_____] - // [____________________SSSSSSSS_____] - // [______________________SSSSSSSS___] - // - // e.g. Moving two bytes and making sure dst is then aligned. - // [ | | | | ] - // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] - // [ _______________LLLLLLLL_________] - // [ ___________________LLLLLLLL_____] - // [__________________SSSSSSSS_______] - // [______________________SSSSSSSS___] - template - static inline void move_backward(DstAddrT dst, SrcAddrT src, - size_t runtime_size) { - const auto dst_end = offsetAddrAssumeAligned<1>(dst, runtime_size); - const auto src_end = offsetAddrAssumeAligned<1>(src, runtime_size); - auto aligned_after_end = align(dst_end, src_end, 0); - // We move pointers back by 2 x Size so we can perform HeadTail. - auto aligned = aligned_after_end.stepBack().stepBack(); - HeadTail::move(aligned.arg1, aligned.arg2, aligned.size); - NextT::move_backward(dst, src, runtime_size - aligned.size); - } - - template - static inline void set(DstAddrT dst, ubyte value, size_t runtime_size) { - SizedOpT::set(dst, value); - DstAddrT _(nullptr); - auto aligned = align(dst, _, runtime_size); - NextT::set(aligned.arg1, value, aligned.size); - } - - template - static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2, - size_t runtime_size) { - if (const uint64_t res = SizedOpT::isDifferent(src1, src2)) - return res; - auto aligned = align(src1, src2, runtime_size); - return NextT::isDifferent(aligned.arg1, aligned.arg2, aligned.size); - } - - template - static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2, - size_t runtime_size) { - if (const int32_t res = SizedOpT::threeWayCmp(src1, src2)) - return res; - auto aligned = align(src1, src2, runtime_size); - return NextT::threeWayCmp(aligned.arg1, aligned.arg2, aligned.size); - } - }; - -private: - static constexpr size_t ALIGN_OP_SIZE = SizedOpT::SIZE; - static_assert(ALIGN_OP_SIZE > 1); - - template struct Aligned { - Arg1AddrT arg1; - Arg2AddrT arg2; - size_t size; - - Aligned stepForward() const { - return Aligned{offsetAddrMultiplesOf(arg1, ALIGN_OP_SIZE), - offsetAddrMultiplesOf(arg2, ALIGN_OP_SIZE), - size - ALIGN_OP_SIZE}; - } - - Aligned stepBack() const { - return Aligned{offsetAddrMultiplesOf(arg1, -ALIGN_OP_SIZE), - offsetAddrMultiplesOf(arg2, -ALIGN_OP_SIZE), - size + ALIGN_OP_SIZE}; - } - }; - - template - static auto makeAligned(Arg1AddrT arg1, Arg2AddrT arg2, size_t size) { - return Aligned{arg1, arg2, size}; - } - - template - static auto align(Arg1AddrT arg1, Arg2AddrT arg2, size_t runtime_size) { - static_assert(IsAddressType::value); - static_assert(IsAddressType::value); - if constexpr (AlignOn == Arg::_1) { - auto offset = offset_to_next_aligned(arg1.ptr_); - return makeAligned(offsetAddrAssumeAligned(arg1, offset), - offsetAddrAssumeAligned<1>(arg2, offset), - runtime_size - offset); - } else if constexpr (AlignOn == Arg::_2) { - auto offset = offset_to_next_aligned(arg2.ptr_); - return makeAligned(offsetAddrAssumeAligned<1>(arg1, offset), - offsetAddrAssumeAligned(arg2, offset), - 
runtime_size - offset); - } else { - DeferredStaticAssert("AlignOn must be either Arg::_1 or Arg::_2"); - } - } -}; - -} // namespace __llvm_libc - -#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ALGORITHM_H diff --git a/libc/src/string/memory_utils/backend_aarch64.h b/libc/src/string/memory_utils/backend_aarch64.h deleted file mode 100644 index 8077a098ff9c08..00000000000000 --- a/libc/src/string/memory_utils/backend_aarch64.h +++ /dev/null @@ -1,71 +0,0 @@ -//===-- Elementary operations for aarch64 ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_AARCH64_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_AARCH64_H - -#if !defined(LLVM_LIBC_ARCH_AARCH64) -#include "src/string/memory_utils/backend_scalar.h" - -#ifdef __ARM_NEON -#include -#endif - -namespace __llvm_libc { - -struct Aarch64Backend : public Scalar64BitBackend { - static constexpr bool IS_BACKEND_TYPE = true; - - template , bool> = true> - static inline T load(const T *src) { - return Scalar64BitBackend::template load(src); - } -}; - -// Implementation of the SizedOp abstraction for the set operation. -struct Zva64 { - static constexpr size_t SIZE = 64; - - template - static inline void set(DstAddrT dst, ubyte value) { -#if __SIZEOF_POINTER__ == 4 - asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory"); -#else - asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory"); -#endif - } -}; - -inline static bool hasZva() { - uint64_t zva_val; - asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val)); - // DC ZVA is permitted if DZP, bit [4] is zero. - // BS, bits [3:0] is log2 of the block size in words. - // So the next line checks whether the instruction is permitted and block size - // is 16 words (i.e. 64 bytes). - return (zva_val & 0b11111) == 0b00100; -} - -namespace aarch64 { -using _1 = SizedOp; -using _2 = SizedOp; -using _3 = SizedOp; -using _4 = SizedOp; -using _8 = SizedOp; -using _16 = SizedOp; -using _32 = SizedOp; -using _64 = SizedOp; -using _128 = SizedOp; -} // namespace aarch64 - -} // namespace __llvm_libc - -#endif // LLVM_LIBC_ARCH_AARCH64 - -#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_AARCH64_H diff --git a/libc/src/string/memory_utils/backend_scalar.h b/libc/src/string/memory_utils/backend_scalar.h deleted file mode 100644 index dba36b159baa6e..00000000000000 --- a/libc/src/string/memory_utils/backend_scalar.h +++ /dev/null @@ -1,104 +0,0 @@ -//===-- Elementary operations for native scalar types ---------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_SCALAR_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_SCALAR_H - -#include "src/__support/CPP/type_traits.h" // ConditionalType, enable_if_t -#include "src/__support/endian.h" - -namespace __llvm_libc { - -struct Scalar64BitBackend { - static constexpr bool IS_BACKEND_TYPE = true; - - template - static constexpr bool IsScalarType = - cpp::is_same_v || cpp::is_same_v || - cpp::is_same_v || cpp::is_same_v; - - template - static inline T load(const T *src) { - static_assert(IsScalarType); - static_assert(TS == Temporality::TEMPORAL, - "Scalar load does not support non-temporal access"); - return *src; - } - - template - static inline void store(T *dst, T value) { - static_assert(IsScalarType); - static_assert(TS == Temporality::TEMPORAL, - "Scalar store does not support non-temporal access"); - *dst = value; - } - - template static inline T splat(ubyte value) { - static_assert(IsScalarType); - return (T(~0ULL) / T(0xFF)) * T(value); - } - - template static inline uint64_t notEquals(T v1, T v2) { - static_assert(IsScalarType); - return v1 ^ v2; - } - - template static inline int32_t threeWayCmp(T v1, T v2) { - DeferredStaticAssert("not implemented"); - } - - // Returns the type to use to consume Size bytes. - template - using getNextType = cpp::conditional_t< - Size >= 8, uint64_t, - cpp::conditional_t= 4, uint32_t, - cpp::conditional_t= 2, uint16_t, uint8_t>>>; -}; - -template <> -int32_t inline Scalar64BitBackend::threeWayCmp(uint8_t a, uint8_t b) { - const int16_t la = Endian::to_big_endian(a); - const int16_t lb = Endian::to_big_endian(b); - return la - lb; -} -template <> -int32_t inline Scalar64BitBackend::threeWayCmp(uint16_t a, - uint16_t b) { - const int32_t la = Endian::to_big_endian(a); - const int32_t lb = Endian::to_big_endian(b); - return la - lb; -} -template <> -int32_t inline Scalar64BitBackend::threeWayCmp(uint32_t a, - uint32_t b) { - const uint32_t la = Endian::to_big_endian(a); - const uint32_t lb = Endian::to_big_endian(b); - return la > lb ? 1 : la < lb ? -1 : 0; -} -template <> -int32_t inline Scalar64BitBackend::threeWayCmp(uint64_t a, - uint64_t b) { - const uint64_t la = Endian::to_big_endian(a); - const uint64_t lb = Endian::to_big_endian(b); - return la > lb ? 1 : la < lb ? -1 : 0; -} - -namespace scalar { -using _1 = SizedOp; -using _2 = SizedOp; -using _3 = SizedOp; -using _4 = SizedOp; -using _8 = SizedOp; -using _16 = SizedOp; -using _32 = SizedOp; -using _64 = SizedOp; -using _128 = SizedOp; -} // namespace scalar - -} // namespace __llvm_libc - -#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_SCALAR_H diff --git a/libc/src/string/memory_utils/backend_x86.h b/libc/src/string/memory_utils/backend_x86.h deleted file mode 100644 index cfdfcdf90131c3..00000000000000 --- a/libc/src/string/memory_utils/backend_x86.h +++ /dev/null @@ -1,219 +0,0 @@ -//===-- Elementary operations for x86 -------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H - -#if defined(LLVM_LIBC_ARCH_X86) -#include "src/__support/CPP/type_traits.h" // ConditionalType, enable_if_t -#include "src/string/memory_utils/backend_scalar.h" - -#ifdef __SSE2__ -#include -#endif // __SSE2__ - -#if defined(__SSE2__) -#define HAS_M128 true -#else -#define HAS_M128 false -#endif - -#if defined(__AVX2__) -#define HAS_M256 true -#else -#define HAS_M256 false -#endif - -#if defined(__AVX512F__) and defined(__AVX512BW__) -#define HAS_M512 true -#else -#define HAS_M512 false -#endif - -namespace __llvm_libc { -struct X86Backend : public Scalar64BitBackend { - static constexpr bool IS_BACKEND_TYPE = true; - - // Scalar types use base class implementations. - template , bool> = true> - static inline T load(const T *src) { - return Scalar64BitBackend::template load(src); - } - - // Scalar types use base class implementations. - template , bool> = true> - static inline void store(T *dst, T value) { - Scalar64BitBackend::template store(dst, value); - } - - // Scalar types use base class implementations. - template , bool> = true> - static inline uint64_t notEquals(T v1, T v2) { - return Scalar64BitBackend::template notEquals(v1, v2); - } - - // Scalar types use base class implementations. - template , bool> = true> - static inline T splat(ubyte value) { - return Scalar64BitBackend::template splat(value); - } - - // Scalar types use base class implementations. - template , bool> = true> - static inline int32_t threeWayCmp(T v1, T v2) { - return Scalar64BitBackend::template threeWayCmp(v1, v2); - } - - // X86 types are specialized below. - template , bool> = true> - static inline T load(const T *src); - - // X86 types are specialized below. - template , bool> = true> - static inline void store(T *dst, T value); - - // X86 types are specialized below. - template , bool> = true> - static inline T splat(ubyte value); - - // X86 types are specialized below. - template , bool> = true> - static inline uint64_t notEquals(T v1, T v2); - - template , bool> = true> - static inline int32_t threeWayCmp(T v1, T v2) { - return char_diff(reinterpret_cast(&v1), - reinterpret_cast(&v2), notEquals(v1, v2)); - } - - // Returns the type to use to consume Size bytes. - template - using getNextType = cpp::conditional_t< - (HAS_M512 && Size >= 64), __m512i, - cpp::conditional_t< - (HAS_M256 && Size >= 32), __m256i, - cpp::conditional_t<(HAS_M128 && Size >= 16), __m128i, - Scalar64BitBackend::getNextType>>>; - -private: - static inline int32_t char_diff(const char *a, const char *b, uint64_t mask) { - const size_t diff_index = mask == 0 ? 
0 : __builtin_ctzll(mask); - const int16_t ca = (unsigned char)a[diff_index]; - const int16_t cb = (unsigned char)b[diff_index]; - return ca - cb; - } -}; - -static inline void repmovsb(void *dst, const void *src, size_t runtime_size) { - asm volatile("rep movsb" - : "+D"(dst), "+S"(src), "+c"(runtime_size) - : - : "memory"); -} - -#define SPECIALIZE_LOAD(T, OS, AS, INTRISIC) \ - template <> inline T X86Backend::load(const T *src) { \ - return INTRISIC(const_cast(src)); \ - } -#define SPECIALIZE_STORE(T, OS, AS, INTRISIC) \ - template <> inline void X86Backend::store(T * dst, T value) { \ - INTRISIC(dst, value); \ - } - -#if HAS_M128 -SPECIALIZE_LOAD(__m128i, Temporality::TEMPORAL, Aligned::YES, _mm_load_si128) -SPECIALIZE_LOAD(__m128i, Temporality::TEMPORAL, Aligned::NO, _mm_loadu_si128) -SPECIALIZE_LOAD(__m128i, Temporality::NON_TEMPORAL, Aligned::YES, - _mm_stream_load_si128) -// X86 non-temporal load needs aligned access -SPECIALIZE_STORE(__m128i, Temporality::TEMPORAL, Aligned::YES, _mm_store_si128) -SPECIALIZE_STORE(__m128i, Temporality::TEMPORAL, Aligned::NO, _mm_storeu_si128) -SPECIALIZE_STORE(__m128i, Temporality::NON_TEMPORAL, Aligned::YES, - _mm_stream_si128) -// X86 non-temporal store needs aligned access -template <> inline __m128i X86Backend::splat<__m128i>(ubyte value) { - return _mm_set1_epi8(__builtin_bit_cast(char, value)); -} -template <> -inline uint64_t X86Backend::notEquals<__m128i>(__m128i a, __m128i b) { - using T = char __attribute__((__vector_size__(16))); - return _mm_movemask_epi8(T(a) != T(b)); -} -#endif // HAS_M128 - -#if HAS_M256 -SPECIALIZE_LOAD(__m256i, Temporality::TEMPORAL, Aligned::YES, _mm256_load_si256) -SPECIALIZE_LOAD(__m256i, Temporality::TEMPORAL, Aligned::NO, _mm256_loadu_si256) -SPECIALIZE_LOAD(__m256i, Temporality::NON_TEMPORAL, Aligned::YES, - _mm256_stream_load_si256) -// X86 non-temporal load needs aligned access -SPECIALIZE_STORE(__m256i, Temporality::TEMPORAL, Aligned::YES, - _mm256_store_si256) -SPECIALIZE_STORE(__m256i, Temporality::TEMPORAL, Aligned::NO, - _mm256_storeu_si256) -SPECIALIZE_STORE(__m256i, Temporality::NON_TEMPORAL, Aligned::YES, - _mm256_stream_si256) -// X86 non-temporal store needs aligned access -template <> inline __m256i X86Backend::splat<__m256i>(ubyte value) { - return _mm256_set1_epi8(__builtin_bit_cast(char, value)); -} -template <> -inline uint64_t X86Backend::notEquals<__m256i>(__m256i a, __m256i b) { - using T = char __attribute__((__vector_size__(32))); - return _mm256_movemask_epi8(T(a) != T(b)); -} -#endif // HAS_M256 - -#if HAS_M512 -SPECIALIZE_LOAD(__m512i, Temporality::TEMPORAL, Aligned::YES, _mm512_load_si512) -SPECIALIZE_LOAD(__m512i, Temporality::TEMPORAL, Aligned::NO, _mm512_loadu_si512) -SPECIALIZE_LOAD(__m512i, Temporality::NON_TEMPORAL, Aligned::YES, - _mm512_stream_load_si512) -// X86 non-temporal load needs aligned access -SPECIALIZE_STORE(__m512i, Temporality::TEMPORAL, Aligned::YES, - _mm512_store_si512) -SPECIALIZE_STORE(__m512i, Temporality::TEMPORAL, Aligned::NO, - _mm512_storeu_si512) -SPECIALIZE_STORE(__m512i, Temporality::NON_TEMPORAL, Aligned::YES, - _mm512_stream_si512) -// X86 non-temporal store needs aligned access -template <> inline __m512i X86Backend::splat<__m512i>(ubyte value) { - return _mm512_broadcastb_epi8(_mm_set1_epi8(__builtin_bit_cast(char, value))); -} -template <> -inline uint64_t X86Backend::notEquals<__m512i>(__m512i a, __m512i b) { - return _mm512_cmpneq_epi8_mask(a, b); -} -#endif // HAS_M512 - -namespace x86 { -using _1 = SizedOp; -using _2 = SizedOp; -using _3 = 
SizedOp<X86Backend, 3>;
-using _4 = SizedOp<X86Backend, 4>;
-using _8 = SizedOp<X86Backend, 8>;
-using _16 = SizedOp<X86Backend, 16>;
-using _32 = SizedOp<X86Backend, 32>;
-using _64 = SizedOp<X86Backend, 64>;
-using _128 = SizedOp<X86Backend, 128>;
-} // namespace x86
-
-} // namespace __llvm_libc
-
-#endif // defined(LLVM_LIBC_ARCH_X86)
-
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H
diff --git a/libc/src/string/memory_utils/backends.h b/libc/src/string/memory_utils/backends.h
deleted file mode 100644
index 6d241fa5eb2898..00000000000000
--- a/libc/src/string/memory_utils/backends.h
+++ /dev/null
@@ -1,60 +0,0 @@
-//===-- Elementary operations to compose memory primitives ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the concept of a Backend.
-// It constitutes the lowest level of the framework and is akin to instruction
-// selection. It defines how to implement aligned/unaligned,
-// temporal/non-temporal native loads and stores for a particular architecture
-// as well as efficient ways to fill and compare types.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKENDS_H
-#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKENDS_H
-
-#include "src/string/memory_utils/address.h"  // Temporality, Aligned
-#include "src/string/memory_utils/sized_op.h" // SizedOp
-#include <stddef.h> // size_t
-#include <stdint.h> // uint##_t
-
-namespace __llvm_libc {
-
-// Backends must implement the following interface.
-struct NoBackend {
-  static constexpr bool IS_BACKEND_TYPE = true;
-
-  // Loads a T from `src` honoring Temporality and Alignment.
-  template <typename T, Temporality TS, Aligned AS> static T load(const T *src);
-
-  // Stores a T to `dst` honoring Temporality and Alignment.
-  template <typename T, Temporality TS, Aligned AS>
-  static void store(T *dst, T value);
-
-  // Returns a T filled with `value` bytes.
-  template <typename T> static T splat(ubyte value);
-
-  // Returns zero iff v1 == v2.
-  template <typename T> static uint64_t notEquals(T v1, T v2);
-
-  // Returns zero iff v1 == v2, a negative number if v1 < v2 and a positive
-  // number otherwise.
-  template <typename T> static int32_t threeWayCmp(T v1, T v2);
-
-  // Returns the type to use to consume Size bytes.
-  // If no type handles Size bytes at once, `getNextType` is `void`.
-  template <size_t Size> using getNextType = void;
-};
-
-} // namespace __llvm_libc
-
-// We inline all backend implementations here to simplify the build system.
-// Each file needs to be guarded with the appropriate LLVM_LIBC_ARCH_XXX ifdef.
-#include "src/string/memory_utils/backend_aarch64.h"
-#include "src/string/memory_utils/backend_scalar.h"
-#include "src/string/memory_utils/backend_x86.h"
-
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKENDS_H
diff --git a/libc/src/string/memory_utils/bzero_implementations.h b/libc/src/string/memory_utils/bzero_implementations.h
new file mode 100644
index 00000000000000..168fdd7e531d25
--- /dev/null
+++ b/libc/src/string/memory_utils/bzero_implementations.h
@@ -0,0 +1,24 @@
+//===-- Implementation of bzero -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BZERO_IMPLEMENTATIONS_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BZERO_IMPLEMENTATIONS_H
+
+#include "src/string/memory_utils/memset_implementations.h"
+
+#include <stddef.h> // size_t
+
+namespace __llvm_libc {
+
+inline static void inline_bzero(char *dst, size_t count) {
+  inline_memset(dst, 0, count);
+}
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BZERO_IMPLEMENTATIONS_H
diff --git a/libc/src/string/memory_utils/sized_op.h b/libc/src/string/memory_utils/sized_op.h
deleted file mode 100644
index 2bca50d6c56d1f..00000000000000
--- a/libc/src/string/memory_utils/sized_op.h
+++ /dev/null
@@ -1,180 +0,0 @@
-//===-- Sized Operations --------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the SizedOp struct that serves as the middle end of the
-// framework. It implements sized memory operations by breaking them down into
-// simpler types whose availability is described in the Backend. It also
-// provides a way to load and store sized chunks of memory (necessary for the
-// move operation). SizedOp structs are the building blocks of higher order
-// algorithms like HeadTail, Align or Loop.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_SIZED_OP_H
-#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_SIZED_OP_H
-
-#include <stddef.h> // size_t
-
-#ifndef LLVM_LIBC_USE_BUILTIN_MEMCPY_INLINE
-#define LLVM_LIBC_USE_BUILTIN_MEMCPY_INLINE                                    \
-  __has_builtin(__builtin_memcpy_inline)
-#endif // LLVM_LIBC_USE_BUILTIN_MEMCPY_INLINE
-
-#ifndef LLVM_LIBC_USE_BUILTIN_MEMSET_INLINE
-#define LLVM_LIBC_USE_BUILTIN_MEMSET_INLINE                                    \
-  __has_builtin(__builtin_memset_inline)
-#endif // LLVM_LIBC_USE_BUILTIN_MEMSET_INLINE
-
-namespace __llvm_libc {
-
-template <typename Backend, size_t Size> struct SizedOp {
-  static constexpr size_t SIZE = Size;
-  // Instantiations of SizedOp are fixed size operations,
-  // i.e. operations that are composable by the types in algorithm.h.
-  static constexpr bool IS_FIXED_SIZE = true;
-
-private:
-  static_assert(Backend::IS_BACKEND_TYPE);
-  static_assert(SIZE > 0);
-  using type = typename Backend::template getNextType<Size>;
-  static constexpr size_t TYPE_SIZE = sizeof(type);
-  static_assert(SIZE >= TYPE_SIZE);
-  static constexpr size_t NEXT_SIZE = Size - TYPE_SIZE;
-  using NextBlock = SizedOp<Backend, NEXT_SIZE>;
-
-  // Returns whether we can use an aligned operation.
-  // This is possible because the address type carries known compile-time
-  // alignment information.
-  template <typename AddrT, typename T> static constexpr Aligned isAligned() {
-    static_assert(IsAddressType<AddrT>::value);
-    return AddrT::ALIGNMENT > 1 && AddrT::ALIGNMENT >= sizeof(T) ? Aligned::YES
-                                                                 : Aligned::NO;
-  }
-
-  // Loads a value of the current `type` from `src`.
-  // This function is responsible for extracting Temporality and Alignment from
-  // the Address type.
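(The `nativeLoad` helper the comment above describes follows right after this note.) It is worth making the decomposition above concrete first: the `type`/`NEXT_SIZE`/`NextBlock` recursion greedily consumes the largest native type the backend offers and recurses on the remainder. A hedged standalone sketch of that policy for the scalar backend (our names, plain power-of-two sizes only):

#include <stddef.h> // size_t
#include <stdio.h>  // printf

// Greedy decomposition as in the scalar backend's getNextType: take the
// largest of {8, 4, 2, 1} that fits, then recurse on what remains.
static constexpr size_t next_chunk(size_t size) {
  return size >= 8 ? 8 : size >= 4 ? 4 : size >= 2 ? 2 : 1;
}

int main() {
  for (size_t size = 15; size > 0; size -= next_chunk(size))
    printf("%zu ", next_chunk(size)); // prints "8 4 2 1" for Size == 15
  return 0;
}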
-  template <typename SrcAddrT> static inline auto nativeLoad(SrcAddrT src) {
-    static_assert(IsAddressType<SrcAddrT>::value && SrcAddrT::IS_READ);
-    constexpr auto AS = isAligned<SrcAddrT, type>();
-    constexpr auto TS = SrcAddrT::TEMPORALITY;
-    return Backend::template load<type, TS, AS>(as<type>(src));
-  }
-
-  // Stores a value of the current `type` to `dst`.
-  // This function is responsible for extracting Temporality and Alignment from
-  // the Address type.
-  template <typename DstAddrT>
-  static inline void nativeStore(type value, DstAddrT dst) {
-    static_assert(IsAddressType<DstAddrT>::value && DstAddrT::IS_WRITE);
-    constexpr auto AS = isAligned<DstAddrT, type>();
-    constexpr auto TS = DstAddrT::TEMPORALITY;
-    return Backend::template store<type, TS, AS>(as<type>(dst), value);
-  }
-
-  // A well aligned POD structure to store Size bytes.
-  // This is used to implement the move operations.
-  struct Value {
-    alignas(alignof(type)) ubyte payload[Size];
-  };
-
-public:
-  template <typename DstAddrT, typename SrcAddrT>
-  static inline void copy(DstAddrT dst, SrcAddrT src) {
-    static_assert(IsAddressType<DstAddrT>::value && DstAddrT::IS_WRITE);
-    static_assert(IsAddressType<SrcAddrT>::value && SrcAddrT::IS_READ);
-    if constexpr (LLVM_LIBC_USE_BUILTIN_MEMCPY_INLINE &&
-                  DstAddrT::TEMPORALITY == Temporality::TEMPORAL &&
-                  SrcAddrT::TEMPORALITY == Temporality::TEMPORAL) {
-      // Delegate the optimized copy to the compiler.
-      __builtin_memcpy_inline(dst.ptr(), src.ptr(), Size);
-      return;
-    }
-    nativeStore(nativeLoad(src), dst);
-    if constexpr (NEXT_SIZE > 0)
-      NextBlock::copy(offsetAddr<TYPE_SIZE>(dst), offsetAddr<TYPE_SIZE>(src));
-  }
-
-  template <typename DstAddrT, typename SrcAddrT>
-  static inline void move(DstAddrT dst, SrcAddrT src) {
-    const auto payload = nativeLoad(src);
-    if constexpr (NEXT_SIZE > 0)
-      NextBlock::move(offsetAddr<TYPE_SIZE>(dst), offsetAddr<TYPE_SIZE>(src));
-    nativeStore(payload, dst);
-  }
-
-  template <typename DstAddrT>
-  static inline void set(DstAddrT dst, ubyte value) {
-    if constexpr (LLVM_LIBC_USE_BUILTIN_MEMSET_INLINE &&
-                  DstAddrT::TEMPORALITY == Temporality::TEMPORAL) {
-      // Delegate the optimized set to the compiler.
-      __builtin_memset_inline(dst.ptr(), static_cast<unsigned char>(value),
-                              Size);
-      return;
-    }
-    nativeStore(Backend::template splat<type>(value), dst);
-    if constexpr (NEXT_SIZE > 0)
-      NextBlock::set(offsetAddr<TYPE_SIZE>(dst), value);
-  }
-
-  template <typename SrcAddrT1, typename SrcAddrT2>
-  static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2) {
-    const uint64_t current =
-        Backend::template notEquals<type>(nativeLoad(src1), nativeLoad(src2));
-    if constexpr (NEXT_SIZE > 0) {
-      // In the case where we cannot handle Size with a single operation (e.g.
-      // Size == 3) we can either return early if `current` is non-zero or
-      // aggregate all the operations through the bitwise or operator.
-      // We chose the latter to reduce branching.
-      return current | (NextBlock::isDifferent(offsetAddr<TYPE_SIZE>(src1),
                                                offsetAddr<TYPE_SIZE>(src2)));
-    } else {
-      return current;
-    }
-  }
-
-  template <typename SrcAddrT1, typename SrcAddrT2>
-  static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2) {
-    const auto a = nativeLoad(src1);
-    const auto b = nativeLoad(src2);
-    // If we cannot handle Size as a single operation we have two choices:
-    // - Either use Backend's threeWayCmp directly and return it if it is
-    //   non-zero.
-    //
-    //   if (int32_t res = Backend::template threeWayCmp<type>(a, b))
-    //     return res;
-    //
-    // - Or use Backend's notEquals first and use threeWayCmp only if
-    //   different; the assumption here is that notEquals is faster than
-    //   threeWayCmp and that we can save cycles when the Size needs to be
-    //   decomposed into many sizes (e.g. Size == 7 => 4 + 2 + 1)
-    //
-    //   if (Backend::template notEquals<type>(a, b))
-    //     return Backend::template threeWayCmp<type>(a, b);
-    //
-    // We chose the former to reduce code bloat and branching.
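(The chosen comparison follows right after this note.) To make the trade-off concrete, here is a hedged standalone sketch of the two strategies over single bytes (our names; the real scalar backend also byte-swaps to big-endian first so that chunk comparisons match memcmp semantics, which is elided here):

#include <stdint.h>

// Strategy 1 (the one kept below): three-way compare each chunk directly.
static int32_t three_way_direct(uint8_t a, uint8_t b) {
  return a > b ? 1 : a < b ? -1 : 0;
}

// Strategy 2 (rejected): cheap inequality test first, full compare only on
// mismatch; it saves cycles when Size decomposes into many chunks.
static int32_t three_way_lazy(uint8_t a, uint8_t b) {
  if (a ^ b) // notEquals: non-zero iff different
    return a > b ? 1 : -1;
  return 0;
}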
-    if (int32_t res = Backend::template threeWayCmp<type>(a, b))
-      return res;
-    if constexpr (NEXT_SIZE > 0)
-      return NextBlock::threeWayCmp(offsetAddr<TYPE_SIZE>(src1),
-                                    offsetAddr<TYPE_SIZE>(src2));
-    return 0;
-  }
-
-  template <typename SrcAddrT> static Value load(SrcAddrT src) {
-    Value output;
-    copy(DstAddr<alignof(type)>(output.payload), src);
-    return output;
-  }
-
-  template <typename DstAddrT> static void store(DstAddrT dst, Value value) {
-    copy(dst, SrcAddr<alignof(type)>(value.payload));
-  }
-};
-
-} // namespace __llvm_libc
-
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_SIZED_OP_H
diff --git a/libc/src/string/stpncpy.cpp b/libc/src/string/stpncpy.cpp
index 25e916251bad89..cc4d89d8e2bbcf 100644
--- a/libc/src/string/stpncpy.cpp
+++ b/libc/src/string/stpncpy.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/string/stpncpy.h"
-#include "src/string/memory_utils/memset_implementations.h"
+#include "src/string/memory_utils/bzero_implementations.h"
 
 #include "src/__support/common.h"
 
@@ -22,7 +22,7 @@ LLVM_LIBC_FUNCTION(char *, stpncpy,
     dest[i] = src[i];
   // When n > strlen(src), n - strlen(src) '\0' bytes are appended.
   if (n > i)
-    inline_memset(dest + i, 0, n - i);
+    inline_bzero(dest + i, n - i);
   return dest + i;
 }
diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h
index 708475e4e97f58..b1b434dbf17235 100644
--- a/libc/src/string/string_utils.h
+++ b/libc/src/string/string_utils.h
@@ -12,7 +12,7 @@
 #include "src/__support/CPP/bitset.h"
 #include "src/__support/common.h"
 #include "src/string/memory_utils/memcpy_implementations.h"
-#include "src/string/memory_utils/memset_implementations.h"
+#include "src/string/memory_utils/bzero_implementations.h"
 #include <stddef.h> // size_t
 
 namespace __llvm_libc {
@@ -94,7 +94,7 @@ static inline size_t strlcpy(char *__restrict dst, const char *__restrict src,
     return len;
   size_t n = len < size - 1 ? len : size - 1;
   inline_memcpy(dst, src, n);
-  inline_memset(dst + n, 0, size - n);
+  inline_bzero(dst + n, size - n);
   return len;
 }
diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt
index 4d8e45d8cdce55..8f926273de5d57 100644
--- a/libc/test/src/string/memory_utils/CMakeLists.txt
+++ b/libc/test/src/string/memory_utils/CMakeLists.txt
@@ -3,8 +3,6 @@ add_libc_unittest(
   SUITE
     libc_string_unittests
   SRCS
-    address_test.cpp
-    backend_test.cpp
     elements_test.cpp
     memory_access_test.cpp
     utils_test.cpp
@@ -17,19 +15,3 @@ add_libc_unittest(
     libc.src.__support.CPP.array
     libc.src.__support.CPP.span
   )
-
-if(NOT LLVM_LIBC_FULL_BUILD)
-# Disabling this unittest in fullbuild mode as #include <sstream> is pulling an
-# incomplete pthread implementation from llvm-libc.
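(The guarded `add_libc_unittest` block for algorithm_test continues below.) One note on the new `inline_bzero` helper this hunk switches to: it is a zero-cost wrapper over `inline_memset(dst, 0, count)` that documents intent at call sites. A hedged standalone sketch of the patched `strlcpy` pattern, written with the plain libc equivalents (our names):

#include <stddef.h> // size_t
#include <string.h> // strlen, memcpy, memset

// Standalone strlcpy-style sketch mirroring the patched call sites: copy what
// fits, then zero-fill the rest of the destination (terminator included).
static size_t strlcpy_sketch(char *dst, const char *src, size_t size) {
  const size_t len = strlen(src);
  if (size == 0)
    return len;
  const size_t n = len < size - 1 ? len : size - 1;
  memcpy(dst, src, n);
  memset(dst + n, 0, size - n); // the patch spells this inline_bzero(dst + n, size - n)
  return len;
}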
-add_libc_unittest( - algorithm_test - SUITE - libc_string_unittests - SRCS - algorithm_test.cpp - DEPENDS - libc.src.string.memory_utils.memory_utils - libc.src.__support.CPP.array - libc.src.__support.CPP.span -) -endif() diff --git a/libc/test/src/string/memory_utils/address_test.cpp b/libc/test/src/string/memory_utils/address_test.cpp deleted file mode 100644 index fe9361ba573e53..00000000000000 --- a/libc/test/src/string/memory_utils/address_test.cpp +++ /dev/null @@ -1,80 +0,0 @@ -#include "utils/UnitTest/Test.h" -#include - -namespace __llvm_libc { - -TEST(LlvmLibcAddress, AliasAreAddresses) { - ASSERT_TRUE(IsAddressType>::value); - ASSERT_TRUE(IsAddressType>::value); - ASSERT_TRUE(IsAddressType>::value); - ASSERT_TRUE(IsAddressType>::value); -} - -TEST(LlvmLibcAddress, AliasHaveRightPermissions) { - ASSERT_TRUE(SrcAddr<1>::IS_READ); - ASSERT_TRUE(NtSrcAddr<1>::IS_READ); - ASSERT_TRUE(DstAddr<1>::IS_WRITE); - ASSERT_TRUE(NtDstAddr<1>::IS_WRITE); -} - -TEST(LlvmLibcAddress, AliasHaveRightSemantic) { - ASSERT_EQ(SrcAddr<1>::TEMPORALITY, Temporality::TEMPORAL); - ASSERT_EQ(DstAddr<1>::TEMPORALITY, Temporality::TEMPORAL); - ASSERT_EQ(NtSrcAddr<1>::TEMPORALITY, Temporality::NON_TEMPORAL); - ASSERT_EQ(NtDstAddr<1>::TEMPORALITY, Temporality::NON_TEMPORAL); -} - -TEST(LlvmLibcAddress, AliasHaveRightAlignment) { - ASSERT_EQ(SrcAddr<1>::ALIGNMENT, size_t(1)); - ASSERT_EQ(SrcAddr<4>::ALIGNMENT, size_t(4)); -} - -TEST(LlvmLibcAddress, NarrowAlignment) { - // Address 8-byte aligned, offset by 8. - ASSERT_EQ(offsetAddr<8>(SrcAddr<8>(nullptr)).ALIGNMENT, size_t(8)); - // Address 16-byte aligned, offset by 4. - ASSERT_EQ(offsetAddr<4>(SrcAddr<16>(nullptr)).ALIGNMENT, size_t(4)); - // Address 4-byte aligned, offset by 16. - ASSERT_EQ(offsetAddr<16>(SrcAddr<4>(nullptr)).ALIGNMENT, size_t(4)); - // Address 4-byte aligned, offset by 1. - ASSERT_EQ(offsetAddr<1>(SrcAddr<4>(nullptr)).ALIGNMENT, size_t(1)); - // Address 4-byte aligned, offset by 2. - ASSERT_EQ(offsetAddr<2>(SrcAddr<4>(nullptr)).ALIGNMENT, size_t(2)); - // Address 4-byte aligned, offset by 6. - ASSERT_EQ(offsetAddr<6>(SrcAddr<4>(nullptr)).ALIGNMENT, size_t(2)); - // Address 4-byte aligned, offset by 10. - ASSERT_EQ(offsetAddr<10>(SrcAddr<4>(nullptr)).ALIGNMENT, size_t(2)); - // Address 8-byte aligned, offset by 6. 
- ASSERT_EQ(offsetAddr<6>(SrcAddr<8>(nullptr)).ALIGNMENT, size_t(2)); -} - -TEST(LlvmLibcAddress, OffsetAddr) { - ubyte a; - SrcAddr<1> addr(&a); - ASSERT_EQ((const void *)offsetAddr<4>(addr).ptr(), (const void *)(&a + 4)); - ASSERT_EQ((const void *)offsetAddr<32>(addr).ptr(), (const void *)(&a + 32)); -} - -TEST(LlvmLibcAddress, AssumeAligned) { - SrcAddr<16> addr(nullptr); - ASSERT_EQ(offsetAddrAssumeAligned<8>(addr, 0).ALIGNMENT, size_t(8)); - ASSERT_EQ(offsetAddrAssumeAligned<1>(addr, 0).ALIGNMENT, size_t(1)); - ASSERT_EQ(offsetAddrMultiplesOf<4>(addr, 0).ALIGNMENT, size_t(4)); - ASSERT_EQ(offsetAddrMultiplesOf<32>(addr, 0).ALIGNMENT, size_t(16)); -} - -TEST(LlvmLibcAddress, offsetAddrAssumeAligned) { - ubyte a; - SrcAddr<1> addr(&a); - ASSERT_EQ((const void *)offsetAddrAssumeAligned<1>(addr, 17).ptr(), - (const void *)(&a + 17)); -} - -TEST(LlvmLibcAddress, offsetAddrMultiplesOf) { - ubyte a; - SrcAddr<1> addr(&a); - ASSERT_EQ((const void *)offsetAddrMultiplesOf<4>(addr, 16).ptr(), - (const void *)(&a + 16)); -} - -} // namespace __llvm_libc diff --git a/libc/test/src/string/memory_utils/algorithm_test.cpp b/libc/test/src/string/memory_utils/algorithm_test.cpp deleted file mode 100644 index d973fbcd5c19a9..00000000000000 --- a/libc/test/src/string/memory_utils/algorithm_test.cpp +++ /dev/null @@ -1,566 +0,0 @@ -#define LLVM_LIBC_USE_BUILTIN_MEMCPY_INLINE 0 -#define LLVM_LIBC_USE_BUILTIN_MEMSET_INLINE 0 - -#include "utils/UnitTest/Test.h" -#include -#include -#include - -#include - -namespace __llvm_libc { - -struct alignas(64) Buffer : cpp::array { - bool contains(const char *ptr) const { - return ptr >= data() && ptr < (data() + size()); - } - size_t getOffset(const char *ptr) const { return ptr - data(); } - void fill(char c) { - for (auto itr = begin(); itr != end(); ++itr) - *itr = c; - } -}; - -static Buffer buffer1; -static Buffer buffer2; -static std::ostringstream LOG; - -struct TestBackend { - static constexpr bool IS_BACKEND_TYPE = true; - - template static void log(const char *Action, const char *ptr) { - LOG << Action << "<" << sizeof(T) << "> "; - if (buffer1.contains(ptr)) - LOG << "a[" << buffer1.getOffset(ptr) << "]"; - else if (buffer2.contains(ptr)) - LOG << "b[" << buffer2.getOffset(ptr) << "]"; - LOG << "\n"; - } - - template - static T load(const T *src) { - log((AS == Aligned::YES ? "LdA" : "LdU"), - reinterpret_cast(src)); - return Scalar64BitBackend::load(src); - } - - template - static void store(T *dst, T value) { - log((AS == Aligned::YES ? 
"StA" : "StU"), - reinterpret_cast(dst)); - Scalar64BitBackend::store(dst, value); - } - - template static inline T splat(ubyte value) { - LOG << "Splat<" << sizeof(T) << "> " << (unsigned)value << '\n'; - return Scalar64BitBackend::splat(value); - } - - template static inline uint64_t notEquals(T v1, T v2) { - LOG << "Neq<" << sizeof(T) << ">\n"; - return Scalar64BitBackend::notEquals(v1, v2); - } - - template static inline int32_t threeWayCmp(T v1, T v2) { - LOG << "Diff<" << sizeof(T) << ">\n"; - return Scalar64BitBackend::threeWayCmp(v1, v2); - } - - template - using getNextType = Scalar64BitBackend::getNextType; -}; - -struct LlvmLibcAlgorithm : public testing::Test { - void SetUp() override { - LOG = std::ostringstream(); - LOG << '\n'; - } - - void fillEqual() { - buffer1.fill('a'); - buffer2.fill('a'); - } - - void fillDifferent() { - buffer1.fill('a'); - buffer2.fill('b'); - } - - const char *getTrace() { - trace_ = LOG.str(); - return trace_.c_str(); - } - - const char *stripComments(const char *expected) { - expected_.clear(); - std::stringstream ss(expected); - std::string line; - while (std::getline(ss, line, '\n')) { - const auto pos = line.find('#'); - if (pos == std::string::npos) { - expected_ += line; - } else { - auto log = line.substr(0, pos); - while (!log.empty() && std::isspace(log.back())) - log.pop_back(); - expected_ += log; - } - expected_ += '\n'; - } - return expected_.c_str(); - } - - template SrcAddr buf1(size_t offset = 0) const { - return buffer1.data() + offset; - } - template SrcAddr buf2(size_t offset = 0) const { - return buffer2.data() + offset; - } - template DstAddr dst(size_t offset = 0) const { - return buffer1.data() + offset; - } - template SrcAddr src(size_t offset = 0) const { - return buffer2.data() + offset; - } - -private: - std::string trace_; - std::string expected_; -}; - -using _8 = SizedOp; - -/////////////////////////////////////////////////////////////////////////////// -//// Testing fixed fized forward operations -/////////////////////////////////////////////////////////////////////////////// - -/////////////////////////////////////////////////////////////////////////////// -// Copy - -TEST_F(LlvmLibcAlgorithm, copy_1) { - SizedOp::copy(dst(), src()); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<1> b[0] -StU<1> a[0] -)")); -} - -TEST_F(LlvmLibcAlgorithm, copy_15) { - SizedOp::copy(dst(), src()); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> b[0] -StU<8> a[0] -LdU<4> b[8] -StU<4> a[8] -LdU<2> b[12] -StU<2> a[12] -LdU<1> b[14] -StU<1> a[14] -)")); -} - -TEST_F(LlvmLibcAlgorithm, copy_16) { - SizedOp::copy(dst(), src()); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> b[0] -StU<8> a[0] -LdU<8> b[8] -StU<8> a[8] -)")); -} - -/////////////////////////////////////////////////////////////////////////////// -// Move - -TEST_F(LlvmLibcAlgorithm, move_1) { - SizedOp::move(dst(), src()); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<1> b[0] -StU<1> a[0] -)")); -} - -TEST_F(LlvmLibcAlgorithm, move_15) { - SizedOp::move(dst(), src()); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> b[0] -LdU<4> b[8] -LdU<2> b[12] -LdU<1> b[14] -StU<1> a[14] -StU<2> a[12] -StU<4> a[8] -StU<8> a[0] -)")); -} - -TEST_F(LlvmLibcAlgorithm, move_16) { - SizedOp::move(dst(), src()); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> b[0] -LdU<8> b[8] -StU<8> a[8] -StU<8> a[0] -)")); -} - -/////////////////////////////////////////////////////////////////////////////// -// set - -TEST_F(LlvmLibcAlgorithm, set_1) { - SizedOp::set(dst(), 
ubyte{42}); - EXPECT_STREQ(getTrace(), stripComments(R"( -Splat<1> 42 -StU<1> a[0] -)")); -} - -TEST_F(LlvmLibcAlgorithm, set_15) { - SizedOp::set(dst(), ubyte{42}); - EXPECT_STREQ(getTrace(), stripComments(R"( -Splat<8> 42 -StU<8> a[0] -Splat<4> 42 -StU<4> a[8] -Splat<2> 42 -StU<2> a[12] -Splat<1> 42 -StU<1> a[14] -)")); -} - -TEST_F(LlvmLibcAlgorithm, set_16) { - SizedOp::set(dst(), ubyte{42}); - EXPECT_STREQ(getTrace(), stripComments(R"( -Splat<8> 42 -StU<8> a[0] -Splat<8> 42 -StU<8> a[8] -)")); -} - -/////////////////////////////////////////////////////////////////////////////// -// different - -TEST_F(LlvmLibcAlgorithm, different_1) { - fillEqual(); - SizedOp::isDifferent(buf1(), buf2()); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<1> a[0] -LdU<1> b[0] -Neq<1> -)")); -} - -TEST_F(LlvmLibcAlgorithm, different_15) { - fillEqual(); - SizedOp::isDifferent(buf1(), buf2()); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> a[0] -LdU<8> b[0] -Neq<8> -LdU<4> a[8] -LdU<4> b[8] -Neq<4> -LdU<2> a[12] -LdU<2> b[12] -Neq<2> -LdU<1> a[14] -LdU<1> b[14] -Neq<1> -)")); -} - -TEST_F(LlvmLibcAlgorithm, different_15_no_shortcircuit) { - fillDifferent(); - SizedOp::isDifferent(buf1(), buf2()); - // If buffer compare isDifferent we continue to aggregate. - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> a[0] -LdU<8> b[0] -Neq<8> -LdU<4> a[8] -LdU<4> b[8] -Neq<4> -LdU<2> a[12] -LdU<2> b[12] -Neq<2> -LdU<1> a[14] -LdU<1> b[14] -Neq<1> -)")); -} - -TEST_F(LlvmLibcAlgorithm, different_16) { - fillEqual(); - SizedOp::isDifferent(buf1(), buf2()); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> a[0] -LdU<8> b[0] -Neq<8> -LdU<8> a[8] -LdU<8> b[8] -Neq<8> -)")); -} - -/////////////////////////////////////////////////////////////////////////////// -// three_way_cmp - -TEST_F(LlvmLibcAlgorithm, three_way_cmp_eq_1) { - fillEqual(); - SizedOp::threeWayCmp(buf1(), buf2()); - // Buffer compare equal, returning 0 and no call to Diff. - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<1> a[0] -LdU<1> b[0] -Diff<1> -)")); -} - -TEST_F(LlvmLibcAlgorithm, three_way_cmp_eq_15) { - fillEqual(); - SizedOp::threeWayCmp(buf1(), buf2()); - // Buffer compare equal, returning 0 and no call to Diff. - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> a[0] -LdU<8> b[0] -Diff<8> -LdU<4> a[8] -LdU<4> b[8] -Diff<4> -LdU<2> a[12] -LdU<2> b[12] -Diff<2> -LdU<1> a[14] -LdU<1> b[14] -Diff<1> -)")); -} - -TEST_F(LlvmLibcAlgorithm, three_way_cmp_neq_15_shortcircuit) { - fillDifferent(); - SizedOp::threeWayCmp(buf1(), buf2()); - // If buffer compare isDifferent we stop early. - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> a[0] -LdU<8> b[0] -Diff<8> -)")); -} - -TEST_F(LlvmLibcAlgorithm, three_way_cmp_eq_16) { - fillEqual(); - SizedOp::threeWayCmp(buf1(), buf2()); - // Buffer compare equal, returning 0 and no call to Diff. 
- EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> a[0] -LdU<8> b[0] -Diff<8> -LdU<8> a[8] -LdU<8> b[8] -Diff<8> -)")); -} - -/////////////////////////////////////////////////////////////////////////////// -//// Testing skip operations -/////////////////////////////////////////////////////////////////////////////// - -TEST_F(LlvmLibcAlgorithm, skip_and_set) { - Skip<11>::Then>::set(dst(), ubyte{42}); - EXPECT_STREQ(getTrace(), stripComments(R"( -Splat<1> 42 -StU<1> a[11] -)")); -} - -TEST_F(LlvmLibcAlgorithm, skip_and_different_1) { - Skip<11>::Then>::isDifferent(buf1(), buf2()); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<1> a[11] -LdU<1> b[11] -Neq<1> -)")); -} - -TEST_F(LlvmLibcAlgorithm, skip_and_three_way_cmp_8) { - Skip<11>::Then>::threeWayCmp(buf1(), buf2()); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<1> a[11] -LdU<1> b[11] -Diff<1> -)")); -} - -/////////////////////////////////////////////////////////////////////////////// -//// Testing tail operations -/////////////////////////////////////////////////////////////////////////////// - -TEST_F(LlvmLibcAlgorithm, tail_copy_8) { - Tail<_8>::copy(dst(), src(), 16); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> b[8] -StU<8> a[8] -)")); -} - -TEST_F(LlvmLibcAlgorithm, tail_move_8) { - Tail<_8>::move(dst(), src(), 16); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> b[8] -StU<8> a[8] -)")); -} - -TEST_F(LlvmLibcAlgorithm, tail_set_8) { - Tail<_8>::set(dst(), ubyte{42}, 16); - EXPECT_STREQ(getTrace(), stripComments(R"( -Splat<8> 42 -StU<8> a[8] -)")); -} - -TEST_F(LlvmLibcAlgorithm, tail_different_8) { - fillEqual(); - Tail<_8>::isDifferent(buf1(), buf2(), 16); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> a[8] -LdU<8> b[8] -Neq<8> -)")); -} - -TEST_F(LlvmLibcAlgorithm, tail_three_way_cmp_8) { - fillEqual(); - Tail<_8>::threeWayCmp(buf1(), buf2(), 16); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> a[8] -LdU<8> b[8] -Diff<8> -)")); -} - -/////////////////////////////////////////////////////////////////////////////// -//// Testing HeadTail operations -/////////////////////////////////////////////////////////////////////////////// - -TEST_F(LlvmLibcAlgorithm, head_tail_copy_8) { - HeadTail<_8>::copy(dst(), src(), 16); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> b[0] -StU<8> a[0] -LdU<8> b[8] -StU<8> a[8] -)")); -} - -/////////////////////////////////////////////////////////////////////////////// -//// Testing Loop operations -/////////////////////////////////////////////////////////////////////////////// - -TEST_F(LlvmLibcAlgorithm, loop_copy_one_iteration_and_tail) { - Loop<_8>::copy(dst(), src(), 10); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> b[0] -StU<8> a[0] # covers 0-7 -LdU<8> b[2] -StU<8> a[2] # covers 2-9 -)")); -} - -TEST_F(LlvmLibcAlgorithm, loop_copy_two_iteration_and_tail) { - Loop<_8>::copy(dst(), src(), 17); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> b[0] -StU<8> a[0] # covers 0-7 -LdU<8> b[8] -StU<8> a[8] # covers 8-15 -LdU<8> b[9] -StU<8> a[9] # covers 9-16 -)")); -} - -TEST_F(LlvmLibcAlgorithm, loop_with_one_turn_is_inefficient_but_ok) { - Loop<_8>::copy(dst(), src(), 8); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> b[0] -StU<8> a[0] # first iteration covers 0-7 -LdU<8> b[0] # tail also covers 0-7 but since Loop is supposed to be used -StU<8> a[0] # with a sufficient number of iterations the tail cost is amortised -)")); -} - -TEST_F(LlvmLibcAlgorithm, loop_with_round_number_of_turn) { - Loop<_8>::copy(dst(), src(), 24); - EXPECT_STREQ(getTrace(), 
stripComments(R"( -LdU<8> b[0] -StU<8> a[0] # first iteration covers 0-7 -LdU<8> b[8] -StU<8> a[8] # second iteration covers 8-15 -LdU<8> b[16] -StU<8> a[16] -)")); -} - -TEST_F(LlvmLibcAlgorithm, dst_aligned_loop) { - Loop<_8>::copy(dst<16>(), src(), 23); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> b[0] -StA<8> a[0] # store is aligned on 16B -LdU<8> b[8] -StA<8> a[8] # subsequent stores are aligned -LdU<8> b[15] -StU<8> a[15] # Tail is always unaligned -)")); -} - -TEST_F(LlvmLibcAlgorithm, aligned_loop) { - Loop<_8>::copy(dst<16>(), src<8>(), 23); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdA<8> b[0] # load is aligned on 8B -StA<8> a[0] # store is aligned on 16B -LdA<8> b[8] # subsequent loads are aligned -StA<8> a[8] # subsequent stores are aligned -LdU<8> b[15] # Tail is always unaligned -StU<8> a[15] # Tail is always unaligned -)")); -} - -/////////////////////////////////////////////////////////////////////////////// -//// Testing Align operations -/////////////////////////////////////////////////////////////////////////////// - -TEST_F(LlvmLibcAlgorithm, align_dst_copy_8) { - Align<_8, Arg::Dst>::Then>::copy(dst(2), src(3), 31); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> b[3] -StU<8> a[2] # First store covers unaligned bytes -LdU<8> b[9] -StA<8> a[8] # First aligned store -LdU<8> b[17] -StA<8> a[16] # Subsequent stores are aligned -LdU<8> b[25] -StA<8> a[24] # Subsequent stores are aligned -LdU<8> b[26] -StU<8> a[25] # Last store covers remaining bytes -)")); -} - -TEST_F(LlvmLibcAlgorithm, align_src_copy_8) { - Align<_8, Arg::Src>::Then>::copy(dst(2), src(3), 31); - EXPECT_STREQ(getTrace(), stripComments(R"( -LdU<8> b[3] # First load covers unaligned bytes -StU<8> a[2] -LdA<8> b[8] # First aligned load -StU<8> a[7] -LdA<8> b[16] # Subsequent loads are aligned -StU<8> a[15] -LdA<8> b[24] # Subsequent loads are aligned -StU<8> a[23] -LdU<8> b[26] # Last load covers remaining bytes -StU<8> a[25] -)")); -} - -} // namespace __llvm_libc diff --git a/libc/test/src/string/memory_utils/backend_test.cpp b/libc/test/src/string/memory_utils/backend_test.cpp deleted file mode 100644 index 72fb7c4cf53b1c..00000000000000 --- a/libc/test/src/string/memory_utils/backend_test.cpp +++ /dev/null @@ -1,200 +0,0 @@ -//===-- Unittests for backends --------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/__support/CPP/array.h" -#include "src/__support/CPP/bit.h" -#include "src/__support/CPP/span.h" -#include "src/__support/architectures.h" -#include "src/string/memory_utils/backends.h" -#include "utils/UnitTest/Test.h" -#include - -namespace __llvm_libc { - -template using Buffer = cpp::array; - -static char GetRandomChar() { - // Implementation of C++ minstd_rand seeded with 123456789. 
- // https://en.cppreference.com/w/cpp/numeric/random - // "Minimum standard", recommended by Park, Miller, and Stockmeyer in 1993 - static constexpr const uint64_t a = 48271; - static constexpr const uint64_t c = 0; - static constexpr const uint64_t m = 2147483647; - static uint64_t seed = 123456789; - seed = (a * seed + c) % m; - return seed; -} - -static void Randomize(cpp::span buffer) { - for (auto ¤t : buffer) - current = GetRandomChar(); -} - -template static Buffer GetRandomBuffer() { - Buffer buffer; - Randomize(buffer); - return buffer; -} - -template struct Conf { - static_assert(Backend::IS_BACKEND_TYPE); - using BufferT = Buffer; - using T = typename Backend::template getNextType; - static_assert(sizeof(T) == Size); - static constexpr size_t SIZE = Size; - - static BufferT splat(ubyte value) { - return cpp::bit_cast(Backend::template splat(value)); - } - - static uint64_t notEquals(const BufferT &v1, const BufferT &v2) { - return Backend::template notEquals(cpp::bit_cast(v1), - cpp::bit_cast(v2)); - } - - static int32_t threeWayCmp(const BufferT &v1, const BufferT &v2) { - return Backend::template threeWayCmp(cpp::bit_cast(v1), - cpp::bit_cast(v2)); - } -}; - -using FunctionTypes = testing::TypeList< // -#if defined(LLVM_LIBC_ARCH_X86) // - Conf, // - Conf, // - Conf, // - Conf, // -#if HAS_M128 - Conf, // -#endif -#if HAS_M256 - Conf, // -#endif -#if HAS_M512 - Conf, // -#endif -#endif // defined(LLVM_LIBC_ARCH_X86) - Conf, // - Conf, // - Conf, // - Conf // - >; - -TYPED_TEST(LlvmLibcMemoryBackend, splat, FunctionTypes) { - for (auto value : cpp::array{0u, 1u, 255u}) { - alignas(64) const auto stored = - ParamType::splat(cpp::bit_cast(value)); - for (size_t i = 0; i < ParamType::SIZE; ++i) - EXPECT_EQ(cpp::bit_cast(stored[i]), value); - } -} - -TYPED_TEST(LlvmLibcMemoryBackend, notEquals, FunctionTypes) { - alignas(64) const auto a = GetRandomBuffer(); - EXPECT_EQ(ParamType::notEquals(a, a), uint64_t(0)); - for (size_t i = 0; i < a.size(); ++i) { - alignas(64) auto b = a; - ++b[i]; - EXPECT_NE(ParamType::notEquals(a, b), uint64_t(0)); - EXPECT_NE(ParamType::notEquals(b, a), uint64_t(0)); - } -} - -TYPED_TEST(LlvmLibcMemoryBackend, threeWayCmp, FunctionTypes) { - alignas(64) const auto a = GetRandomBuffer(); - EXPECT_EQ(ParamType::threeWayCmp(a, a), 0); - for (size_t i = 0; i < a.size(); ++i) { - alignas(64) auto b = a; - ++b[i]; - const auto cmp = memcmp(&a, &b, sizeof(a)); - ASSERT_NE(cmp, 0); - if (cmp > 0) { - EXPECT_GT(ParamType::threeWayCmp(a, b), 0); - EXPECT_LT(ParamType::threeWayCmp(b, a), 0); - } else { - EXPECT_LT(ParamType::threeWayCmp(a, b), 0); - EXPECT_GT(ParamType::threeWayCmp(b, a), 0); - } - } -} - -template -struct LoadStoreConf { - static_assert(Backend::IS_BACKEND_TYPE); - using BufferT = Buffer; - using T = typename Backend::template getNextType; - static_assert(sizeof(T) == Size); - static constexpr size_t SIZE = Size; - - static BufferT load(const BufferT &ref) { - const auto *ptr = cpp::bit_cast(ref.data()); - const T value = Backend::template load(ptr); - return cpp::bit_cast(value); - } - - static void store(BufferT &ref, const BufferT value) { - auto *ptr = cpp::bit_cast(ref.data()); - Backend::template store(ptr, cpp::bit_cast(value)); - } -}; - -using LoadStoreTypes = testing::TypeList< // -#if defined(LLVM_LIBC_ARCH_X86) // - LoadStoreConf, // - LoadStoreConf, // - LoadStoreConf, // - LoadStoreConf, // - LoadStoreConf, // - LoadStoreConf, // - LoadStoreConf, // - LoadStoreConf, // -#if HAS_M128 - LoadStoreConf, // - LoadStoreConf, // - 
LoadStoreConf, // -#endif -#if HAS_M256 - LoadStoreConf, // - LoadStoreConf, // - LoadStoreConf, // -#endif -#if HAS_M512 - LoadStoreConf, // - LoadStoreConf, // - LoadStoreConf, // -#endif -#endif // defined(LLVM_LIBC_ARCH_X86) - LoadStoreConf, // - LoadStoreConf, // - LoadStoreConf, // - LoadStoreConf, // - LoadStoreConf, // - LoadStoreConf, // - LoadStoreConf, // - LoadStoreConf // - >; - -TYPED_TEST(LlvmLibcMemoryBackend, load, LoadStoreTypes) { - alignas(64) const auto expected = GetRandomBuffer(); - const auto loaded = ParamType::load(expected); - for (size_t i = 0; i < ParamType::SIZE; ++i) - EXPECT_EQ(loaded[i], expected[i]); -} - -TYPED_TEST(LlvmLibcMemoryBackend, store, LoadStoreTypes) { - alignas(64) const auto expected = GetRandomBuffer(); - alignas(64) typename ParamType::BufferT stored; - ParamType::store(stored, expected); - for (size_t i = 0; i < ParamType::SIZE; ++i) - EXPECT_EQ(stored[i], expected[i]); -} - -} // namespace __llvm_libc diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index d64b051946aecb..926edb71460cee 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -1140,16 +1140,20 @@ size_t Writer::addEntryToStringTable(StringRef str) { Optional Writer::createSymbol(Defined *def) { coff_symbol16 sym; switch (def->kind()) { - case Symbol::DefinedAbsoluteKind: - sym.Value = def->getRVA(); + case Symbol::DefinedAbsoluteKind: { + auto *da = dyn_cast(def); + // Note: COFF symbol can only store 32-bit values, so 64-bit absolute + // values will be truncated. + sym.Value = da->getVA(); sym.SectionNumber = IMAGE_SYM_ABSOLUTE; break; - case Symbol::DefinedSyntheticKind: - // Relative symbols are unrepresentable in a COFF symbol table. - return None; + } default: { // Don't write symbols that won't be written to the output to the symbol // table. + // We also try to write DefinedSynthetic as a normal symbol. Some of these + // symbols do point to an actual chunk, like __safe_se_handler_table. Others + // like __ImageBase are outside of sections and thus cannot be represented. Chunk *c = def->getChunk(); if (!c) return None; diff --git a/lld/test/COFF/symtab-DefinedSynthetic.s b/lld/test/COFF/symtab-DefinedSynthetic.s new file mode 100644 index 00000000000000..f6d8387652f3c1 --- /dev/null +++ b/lld/test/COFF/symtab-DefinedSynthetic.s @@ -0,0 +1,53 @@ +# REQUIRES: x86 + +# The __guard_fids_table is a DefinedSynthetic when control flow guard is +# enabled and there are entries to be added to the fids table. This test uses +# this to check that DefinedSynthetic symbols are being written to the COFF +# symbol table. + +# RUN: llvm-mc -triple x86_64-windows-msvc %s -filetype=obj -o %t.obj +# RUN: lld-link %t.obj -guard:cf -out:%t.exe -entry:main -debug:symtab +# RUN: llvm-readobj --symbols %t.exe | FileCheck --check-prefix=CHECK %s + +# CHECK: Name: __guard_fids_table +# CHECK-NEXT: Value: +# CHECK-NEXT: Section: .rdata (2) + + +# We need @feat.00 to have 0x800 to indicate /guard:cf. 
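+# (Background note, per the MSVC convention: bit 0x800 in the @feat.00
+# absolute symbol marks an object as compiled with /guard:cf; without it
+# the linker would not treat the object as CFG-aware, and no
+# __guard_fids_table symbol would be synthesized for this test to observe.)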
+ .def @feat.00; + .scl 3; + .type 0; + .endef + .globl @feat.00 +@feat.00 = 0x800 + .def main; .scl 2; .type 32; .endef + .globl main # -- Begin function main + .p2align 4, 0x90 +main: + retq + # -- End function + .section .gfids$y,"dr" + .symidx main + .section .giats$y,"dr" + .section .gljmp$y,"dr" + .addrsig_sym main + .section .rdata,"dr" + +.globl _load_config_used + .p2align 3 +_load_config_used: + .long 312 + .fill 124, 1, 0 + .quad __guard_fids_table + .quad __guard_fids_count + .long __guard_flags + .fill 12, 1, 0 + .quad __guard_iat_table + .quad __guard_iat_count + .quad __guard_longjmp_table + .quad __guard_longjmp_count + .fill 72, 1, 0 + .quad __guard_eh_cont_table + .quad __guard_eh_cont_count + .fill 32, 1, 0 diff --git a/lld/test/COFF/symtab.test b/lld/test/COFF/symtab.test index ccf26fde2027b8..41419a942d87bc 100644 --- a/lld/test/COFF/symtab.test +++ b/lld/test/COFF/symtab.test @@ -75,7 +75,7 @@ # CHECK-NEXT: } # CHECK-NEXT: Symbol { # CHECK-NEXT: Name: abs_symbol -# CHECK-NEXT: Value: 2662186735 +# CHECK-NEXT: Value: 3735928559 # CHECK-NEXT: Section: IMAGE_SYM_ABSOLUTE (-1) # CHECK-NEXT: BaseType: Null (0x0) # CHECK-NEXT: ComplexType: Null (0x0) diff --git a/lld/test/COFF/wrap-i386.s b/lld/test/COFF/wrap-i386.s index fd1710f8c3cc5c..4684e3b25e470c 100644 --- a/lld/test/COFF/wrap-i386.s +++ b/lld/test/COFF/wrap-i386.s @@ -16,16 +16,16 @@ // RUN: FileCheck --check-prefix=SYM2 %s < %t.dump // RUN: FileCheck --check-prefix=SYM3 %s < %t.dump -// _foo = 0xffc11000 = 4290842624 -// ___wrap_foo = ffc11010 = 4290842640 +// _foo = 0x00011000 = 69632 +// ___wrap_foo = 0x00011010 = 69648 // SYM1: Name: _foo -// SYM1-NEXT: Value: 4290842624 +// SYM1-NEXT: Value: 69632 // SYM1-NEXT: Section: IMAGE_SYM_ABSOLUTE // SYM1-NEXT: BaseType: Null // SYM1-NEXT: ComplexType: Null // SYM1-NEXT: StorageClass: External // SYM2: Name: ___wrap_foo -// SYM2-NEXT: Value: 4290842640 +// SYM2-NEXT: Value: 69648 // SYM2-NEXT: Section: IMAGE_SYM_ABSOLUTE // SYM2-NEXT: BaseType: Null // SYM2-NEXT: ComplexType: Null diff --git a/lld/test/COFF/wrap.s b/lld/test/COFF/wrap.s index d0afb7f14cdc5f..7a964fa61ce8ae 100644 --- a/lld/test/COFF/wrap.s +++ b/lld/test/COFF/wrap.s @@ -18,16 +18,16 @@ // RUN: FileCheck --check-prefix=SYM2 %s < %t.dump // RUN: FileCheck --check-prefix=SYM3 %s < %t.dump -// foo = 0xC0011000 = 3221295104 -// __wrap_foo = 0xC0011010 = 3221295120 +// foo = 0x00011000 = 69632 +// __wrap_foo = 0x00011010 = 69648 // SYM1: Name: foo -// SYM1-NEXT: Value: 3221295104 +// SYM1-NEXT: Value: 69632 // SYM1-NEXT: Section: IMAGE_SYM_ABSOLUTE // SYM1-NEXT: BaseType: Null // SYM1-NEXT: ComplexType: Null // SYM1-NEXT: StorageClass: External // SYM2: Name: __wrap_foo -// SYM2-NEXT: Value: 3221295120 +// SYM2-NEXT: Value: 69648 // SYM2-NEXT: Section: IMAGE_SYM_ABSOLUTE // SYM2-NEXT: BaseType: Null // SYM2-NEXT: ComplexType: Null diff --git a/lldb/include/lldb/Core/EmulateInstruction.h b/lldb/include/lldb/Core/EmulateInstruction.h index a710c866d9803c..fa049d4180fbf0 100644 --- a/lldb/include/lldb/Core/EmulateInstruction.h +++ b/lldb/include/lldb/Core/EmulateInstruction.h @@ -375,8 +375,11 @@ class EmulateInstruction : public PluginInterface { virtual bool TestEmulation(Stream *out_stream, ArchSpec &arch, OptionValueDictionary *test_data) = 0; - virtual bool GetRegisterInfo(lldb::RegisterKind reg_kind, uint32_t reg_num, - RegisterInfo ®_info) = 0; + bool GetRegisterInfo(lldb::RegisterKind reg_kind, uint32_t reg_num, + RegisterInfo ®_info); + + virtual llvm::Optional + GetRegisterInfo(lldb::RegisterKind 
      reg_kind, uint32_t reg_num) = 0;
 
   // Optional overrides
   virtual bool SetInstruction(const Opcode &insn_opcode,
diff --git a/lldb/source/Core/EmulateInstruction.cpp b/lldb/source/Core/EmulateInstruction.cpp
index 1320e8925553ed..271301b9d3831c 100644
--- a/lldb/source/Core/EmulateInstruction.cpp
+++ b/lldb/source/Core/EmulateInstruction.cpp
@@ -582,3 +582,12 @@ bool EmulateInstruction::CreateFunctionEntryUnwind(UnwindPlan &unwind_plan) {
   unwind_plan.Clear();
   return false;
 }
+
+bool EmulateInstruction::GetRegisterInfo(lldb::RegisterKind reg_kind,
+                                         uint32_t reg_num,
+                                         RegisterInfo &reg_info) {
+  llvm::Optional<RegisterInfo> info = GetRegisterInfo(reg_kind, reg_num);
+  if (info)
+    reg_info = *info;
+  return info.has_value();
+}
\ No newline at end of file
diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp
index fa6511635e287c..5c11b87dcbe03c 100644
--- a/lldb/source/Interpreter/CommandInterpreter.cpp
+++ b/lldb/source/Interpreter/CommandInterpreter.cpp
@@ -2471,8 +2471,12 @@ bool CommandInterpreter::DidProcessStopAbnormally() const {
 
   for (const auto &thread_sp : process_sp->GetThreadList().Threads()) {
     StopInfoSP stop_info = thread_sp->GetStopInfo();
-    if (!stop_info)
-      return false;
+    if (!stop_info) {
+      // If there's no stop_info, keep iterating through the other threads;
+      // it's enough that one thread has a stop_info indicating an abnormal
+      // stop for the process to be considered stopped abnormally.
+      continue;
+    }
 
     const StopReason reason = stop_info->GetStopReason();
     if (reason == eStopReasonException ||
diff --git a/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp b/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp
index 0abfefa43e099b..54aec79d24773e 100644
--- a/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp
+++ b/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp
@@ -42,7 +42,8 @@ LLDB_PLUGIN_DEFINE_ADV(EmulateInstructionARM, InstructionARM)
 // ITSession implementation
 //
 
-static bool GetARMDWARFRegisterInfo(unsigned reg_num, RegisterInfo &reg_info) {
+static llvm::Optional<RegisterInfo> GetARMDWARFRegisterInfo(unsigned reg_num) {
+  RegisterInfo reg_info;
   ::memset(&reg_info, 0, sizeof(RegisterInfo));
   ::memset(reg_info.kinds, LLDB_INVALID_REGNUM, sizeof(reg_info.kinds));
 
@@ -594,9 +595,9 @@ static bool GetARMDWARFRegisterInfo(unsigned reg_num, RegisterInfo &reg_info) {
     break;
 
   default:
-    return false;
+    return {};
   }
-  return true;
+  return reg_info;
 }
 
 // A8.6.50
@@ -782,9 +783,9 @@ bool EmulateInstructionARM::WriteBits32Unknown(int n) {
   return true;
 }
 
-bool EmulateInstructionARM::GetRegisterInfo(lldb::RegisterKind reg_kind,
-                                            uint32_t reg_num,
-                                            RegisterInfo &reg_info) {
+llvm::Optional<RegisterInfo>
+EmulateInstructionARM::GetRegisterInfo(lldb::RegisterKind reg_kind,
+                                       uint32_t reg_num) {
   if (reg_kind == eRegisterKindGeneric) {
     switch (reg_num) {
     case LLDB_REGNUM_GENERIC_PC:
@@ -808,13 +809,13 @@ bool EmulateInstructionARM::GetRegisterInfo(lldb::RegisterKind reg_kind,
       reg_num = dwarf_cpsr;
       break;
     default:
-      return false;
+      return {};
     }
   }
 
   if (reg_kind == eRegisterKindDWARF)
-    return GetARMDWARFRegisterInfo(reg_num, reg_info);
-  return false;
+    return GetARMDWARFRegisterInfo(reg_num);
+  return {};
 }
 
 uint32_t EmulateInstructionARM::GetFramePointerRegisterNumber() const {
diff --git a/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.h b/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.h
index c877724a9d3054..9a51445f9c1a96 100644
--- a/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.h
+++
b/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.h @@ -135,8 +135,9 @@ class EmulateInstructionARM : public EmulateInstruction { bool TestEmulation(Stream *out_stream, ArchSpec &arch, OptionValueDictionary *test_data) override; - bool GetRegisterInfo(lldb::RegisterKind reg_kind, uint32_t reg_num, - RegisterInfo ®_info) override; + using EmulateInstruction::GetRegisterInfo; + llvm::Optional GetRegisterInfo(lldb::RegisterKind reg_kind, + uint32_t reg_num) override; bool CreateFunctionEntryUnwind(UnwindPlan &unwind_plan) override; diff --git a/lldb/source/Plugins/Instruction/ARM64/EmulateInstructionARM64.cpp b/lldb/source/Plugins/Instruction/ARM64/EmulateInstructionARM64.cpp index 6ab77d30564b9a..96a7caa29981a0 100644 --- a/lldb/source/Plugins/Instruction/ARM64/EmulateInstructionARM64.cpp +++ b/lldb/source/Plugins/Instruction/ARM64/EmulateInstructionARM64.cpp @@ -51,11 +51,10 @@ using namespace lldb_private; LLDB_PLUGIN_DEFINE_ADV(EmulateInstructionARM64, InstructionARM64) -static bool LLDBTableGetRegisterInfo(uint32_t reg_num, RegisterInfo ®_info) { +static llvm::Optional LLDBTableGetRegisterInfo(uint32_t reg_num) { if (reg_num >= std::size(g_register_infos_arm64_le)) - return false; - reg_info = g_register_infos_arm64_le[reg_num]; - return true; + return {}; + return g_register_infos_arm64_le[reg_num]; } #define No_VFP 0 @@ -144,9 +143,9 @@ bool EmulateInstructionARM64::SetTargetTriple(const ArchSpec &arch) { return false; } -bool EmulateInstructionARM64::GetRegisterInfo(RegisterKind reg_kind, - uint32_t reg_num, - RegisterInfo ®_info) { +llvm::Optional +EmulateInstructionARM64::GetRegisterInfo(RegisterKind reg_kind, + uint32_t reg_num) { if (reg_kind == eRegisterKindGeneric) { switch (reg_num) { case LLDB_REGNUM_GENERIC_PC: @@ -171,13 +170,13 @@ bool EmulateInstructionARM64::GetRegisterInfo(RegisterKind reg_kind, break; default: - return false; + return {}; } } if (reg_kind == eRegisterKindLLDB) - return LLDBTableGetRegisterInfo(reg_num, reg_info); - return false; + return LLDBTableGetRegisterInfo(reg_num); + return {}; } EmulateInstructionARM64::Opcode * diff --git a/lldb/source/Plugins/Instruction/ARM64/EmulateInstructionARM64.h b/lldb/source/Plugins/Instruction/ARM64/EmulateInstructionARM64.h index 4f11f7387a2ec5..20b1c33c66cda2 100644 --- a/lldb/source/Plugins/Instruction/ARM64/EmulateInstructionARM64.h +++ b/lldb/source/Plugins/Instruction/ARM64/EmulateInstructionARM64.h @@ -65,8 +65,10 @@ class EmulateInstructionARM64 : public lldb_private::EmulateInstruction { return false; } - bool GetRegisterInfo(lldb::RegisterKind reg_kind, uint32_t reg_num, - lldb_private::RegisterInfo ®_info) override; + using EmulateInstruction::GetRegisterInfo; + + llvm::Optional + GetRegisterInfo(lldb::RegisterKind reg_kind, uint32_t reg_num) override; bool CreateFunctionEntryUnwind(lldb_private::UnwindPlan &unwind_plan) override; diff --git a/lldb/source/Plugins/Instruction/MIPS/EmulateInstructionMIPS.cpp b/lldb/source/Plugins/Instruction/MIPS/EmulateInstructionMIPS.cpp index 7aff11ede400dc..37096a5cc67047 100644 --- a/lldb/source/Plugins/Instruction/MIPS/EmulateInstructionMIPS.cpp +++ b/lldb/source/Plugins/Instruction/MIPS/EmulateInstructionMIPS.cpp @@ -585,9 +585,9 @@ const char *EmulateInstructionMIPS::GetRegisterName(unsigned reg_num, return nullptr; } -bool EmulateInstructionMIPS::GetRegisterInfo(RegisterKind reg_kind, - uint32_t reg_num, - RegisterInfo ®_info) { +llvm::Optional +EmulateInstructionMIPS::GetRegisterInfo(RegisterKind reg_kind, + uint32_t reg_num) { if (reg_kind == 
eRegisterKindGeneric) { switch (reg_num) { case LLDB_REGNUM_GENERIC_PC: @@ -611,11 +611,12 @@ bool EmulateInstructionMIPS::GetRegisterInfo(RegisterKind reg_kind, reg_num = dwarf_sr_mips; break; default: - return false; + return {}; } } if (reg_kind == eRegisterKindDWARF) { + RegisterInfo reg_info; ::memset(®_info, 0, sizeof(RegisterInfo)); ::memset(reg_info.kinds, LLDB_INVALID_REGNUM, sizeof(reg_info.kinds)); @@ -636,7 +637,7 @@ bool EmulateInstructionMIPS::GetRegisterInfo(RegisterKind reg_kind, reg_info.format = eFormatVectorOfUInt8; reg_info.encoding = eEncodingVector; } else { - return false; + return {}; } reg_info.name = GetRegisterName(reg_num, false); @@ -662,9 +663,9 @@ bool EmulateInstructionMIPS::GetRegisterInfo(RegisterKind reg_kind, default: break; } - return true; + return reg_info; } - return false; + return {}; } EmulateInstructionMIPS::MipsOpcode * diff --git a/lldb/source/Plugins/Instruction/MIPS/EmulateInstructionMIPS.h b/lldb/source/Plugins/Instruction/MIPS/EmulateInstructionMIPS.h index 4862f6c7e0dc54..e771bda2e1dea5 100644 --- a/lldb/source/Plugins/Instruction/MIPS/EmulateInstructionMIPS.h +++ b/lldb/source/Plugins/Instruction/MIPS/EmulateInstructionMIPS.h @@ -80,8 +80,10 @@ class EmulateInstructionMIPS : public lldb_private::EmulateInstruction { return false; } - bool GetRegisterInfo(lldb::RegisterKind reg_kind, uint32_t reg_num, - lldb_private::RegisterInfo ®_info) override; + using EmulateInstruction::GetRegisterInfo; + + llvm::Optional + GetRegisterInfo(lldb::RegisterKind reg_kind, uint32_t reg_num) override; bool CreateFunctionEntryUnwind(lldb_private::UnwindPlan &unwind_plan) override; diff --git a/lldb/source/Plugins/Instruction/MIPS64/EmulateInstructionMIPS64.cpp b/lldb/source/Plugins/Instruction/MIPS64/EmulateInstructionMIPS64.cpp index b4a860af54bd90..341d954e74be6f 100644 --- a/lldb/source/Plugins/Instruction/MIPS64/EmulateInstructionMIPS64.cpp +++ b/lldb/source/Plugins/Instruction/MIPS64/EmulateInstructionMIPS64.cpp @@ -572,9 +572,9 @@ const char *EmulateInstructionMIPS64::GetRegisterName(unsigned reg_num, return nullptr; } -bool EmulateInstructionMIPS64::GetRegisterInfo(RegisterKind reg_kind, - uint32_t reg_num, - RegisterInfo ®_info) { +llvm::Optional +EmulateInstructionMIPS64::GetRegisterInfo(RegisterKind reg_kind, + uint32_t reg_num) { if (reg_kind == eRegisterKindGeneric) { switch (reg_num) { case LLDB_REGNUM_GENERIC_PC: @@ -598,11 +598,12 @@ bool EmulateInstructionMIPS64::GetRegisterInfo(RegisterKind reg_kind, reg_num = dwarf_sr_mips64; break; default: - return false; + return {}; } } if (reg_kind == eRegisterKindDWARF) { + RegisterInfo reg_info; ::memset(®_info, 0, sizeof(RegisterInfo)); ::memset(reg_info.kinds, LLDB_INVALID_REGNUM, sizeof(reg_info.kinds)); @@ -623,7 +624,7 @@ bool EmulateInstructionMIPS64::GetRegisterInfo(RegisterKind reg_kind, reg_info.format = eFormatVectorOfUInt8; reg_info.encoding = eEncodingVector; } else { - return false; + return {}; } reg_info.name = GetRegisterName(reg_num, false); @@ -649,9 +650,9 @@ bool EmulateInstructionMIPS64::GetRegisterInfo(RegisterKind reg_kind, default: break; } - return true; + return reg_info; } - return false; + return {}; } EmulateInstructionMIPS64::MipsOpcode * diff --git a/lldb/source/Plugins/Instruction/MIPS64/EmulateInstructionMIPS64.h b/lldb/source/Plugins/Instruction/MIPS64/EmulateInstructionMIPS64.h index 3f56bc658c16e6..9c8a95a64f942c 100644 --- a/lldb/source/Plugins/Instruction/MIPS64/EmulateInstructionMIPS64.h +++ b/lldb/source/Plugins/Instruction/MIPS64/EmulateInstructionMIPS64.h @@ 
-72,8 +72,10 @@ class EmulateInstructionMIPS64 : public lldb_private::EmulateInstruction { return false; } - bool GetRegisterInfo(lldb::RegisterKind reg_kind, uint32_t reg_num, - lldb_private::RegisterInfo ®_info) override; + using EmulateInstruction::GetRegisterInfo; + + llvm::Optional + GetRegisterInfo(lldb::RegisterKind reg_kind, uint32_t reg_num) override; bool CreateFunctionEntryUnwind(lldb_private::UnwindPlan &unwind_plan) override; diff --git a/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.cpp b/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.cpp index 4b56a9b6b8c51a..19598ebfd4c30a 100644 --- a/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.cpp +++ b/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.cpp @@ -58,16 +58,15 @@ bool EmulateInstructionPPC64::SetTargetTriple(const ArchSpec &arch) { return arch.GetTriple().isPPC64(); } -static bool LLDBTableGetRegisterInfo(uint32_t reg_num, RegisterInfo ®_info) { +static llvm::Optional LLDBTableGetRegisterInfo(uint32_t reg_num) { if (reg_num >= std::size(g_register_infos_ppc64le)) - return false; - reg_info = g_register_infos_ppc64le[reg_num]; - return true; + return {}; + return g_register_infos_ppc64le[reg_num]; } -bool EmulateInstructionPPC64::GetRegisterInfo(RegisterKind reg_kind, - uint32_t reg_num, - RegisterInfo ®_info) { +llvm::Optional +EmulateInstructionPPC64::GetRegisterInfo(RegisterKind reg_kind, + uint32_t reg_num) { if (reg_kind == eRegisterKindGeneric) { switch (reg_num) { case LLDB_REGNUM_GENERIC_PC: @@ -88,13 +87,13 @@ bool EmulateInstructionPPC64::GetRegisterInfo(RegisterKind reg_kind, break; default: - return false; + return {}; } } if (reg_kind == eRegisterKindLLDB) - return LLDBTableGetRegisterInfo(reg_num, reg_info); - return false; + return LLDBTableGetRegisterInfo(reg_num); + return {}; } bool EmulateInstructionPPC64::ReadInstruction() { diff --git a/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.h b/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.h index 117ff8965eb5c4..b0d9130bfb068b 100644 --- a/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.h +++ b/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.h @@ -61,8 +61,10 @@ class EmulateInstructionPPC64 : public EmulateInstruction { return false; } - bool GetRegisterInfo(lldb::RegisterKind reg_kind, uint32_t reg_num, - RegisterInfo ®_info) override; + using EmulateInstruction::GetRegisterInfo; + + llvm::Optional GetRegisterInfo(lldb::RegisterKind reg_kind, + uint32_t reg_num) override; bool CreateFunctionEntryUnwind(UnwindPlan &unwind_plan) override; diff --git a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp index bcd18ff63d11ba..f84c1159f254da 100644 --- a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp +++ b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp @@ -1286,9 +1286,9 @@ bool EmulateInstructionRISCV::WritePC(lldb::addr_t pc) { LLDB_REGNUM_GENERIC_PC, pc); } -bool EmulateInstructionRISCV::GetRegisterInfo(lldb::RegisterKind reg_kind, - uint32_t reg_index, - RegisterInfo ®_info) { +llvm::Optional +EmulateInstructionRISCV::GetRegisterInfo(lldb::RegisterKind reg_kind, + uint32_t reg_index) { if (reg_kind == eRegisterKindGeneric) { switch (reg_index) { case LLDB_REGNUM_GENERIC_PC: @@ -1320,10 +1320,9 @@ bool EmulateInstructionRISCV::GetRegisterInfo(lldb::RegisterKind reg_kind, RegisterInfoPOSIX_riscv64::GetRegisterInfoCount(m_arch); 
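// A usage sketch for the new Optional-returning API (hypothetical caller;
// `emu` and `consume` are illustrative names, not part of this patch):
//   if (llvm::Optional<RegisterInfo> info = emu.GetRegisterInfo(kind, num))
//     consume(info->name);
// Callers that still need the old bool-plus-out-parameter form go through
// the non-virtual wrapper added in EmulateInstruction.cpp above.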
if (reg_index >= length || reg_kind != eRegisterKindLLDB) - return false; + return {}; - reg_info = array[reg_index]; - return true; + return array[reg_index]; } bool EmulateInstructionRISCV::SetTargetTriple(const ArchSpec &arch) { diff --git a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.h b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.h index 1c7cf6cb08d66f..92f5c950c26ad3 100644 --- a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.h +++ b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.h @@ -76,8 +76,10 @@ class EmulateInstructionRISCV : public EmulateInstruction { bool EvaluateInstruction(uint32_t options) override; bool TestEmulation(Stream *out_stream, ArchSpec &arch, OptionValueDictionary *test_data) override; - bool GetRegisterInfo(lldb::RegisterKind reg_kind, uint32_t reg_num, - RegisterInfo ®_info) override; + using EmulateInstruction::GetRegisterInfo; + + llvm::Optional GetRegisterInfo(lldb::RegisterKind reg_kind, + uint32_t reg_num) override; lldb::addr_t ReadPC(bool &success); bool WritePC(lldb::addr_t pc); diff --git a/lldb/test/Shell/Driver/CommandOnCrashMultiThreaded.test b/lldb/test/Shell/Driver/CommandOnCrashMultiThreaded.test new file mode 100644 index 00000000000000..b16cfc5763715b --- /dev/null +++ b/lldb/test/Shell/Driver/CommandOnCrashMultiThreaded.test @@ -0,0 +1,5 @@ +# REQUIRES: native && (target-x86 || target-x86_64) +# RUN: %clangxx_host %p/Inputs/CommandOnCrashMultiThreaded.cpp -o %t -pthread +# RUN: %lldb -b -o "process launch" -k "process continue" -k "exit" %t | FileCheck %s + +# CHECK: Process {{[0-9]+}} exited with status = 0 diff --git a/lldb/test/Shell/Driver/Inputs/CommandOnCrashMultiThreaded.cpp b/lldb/test/Shell/Driver/Inputs/CommandOnCrashMultiThreaded.cpp new file mode 100644 index 00000000000000..f469d82fbbef9d --- /dev/null +++ b/lldb/test/Shell/Driver/Inputs/CommandOnCrashMultiThreaded.cpp @@ -0,0 +1,13 @@ +#include + +void t_func() { + asm volatile( + "int3\n\t" + ); +} + +int main() { + std::thread t(t_func); + t.join(); + return 0; +} diff --git a/llvm/include/llvm/ADT/Bitfields.h b/llvm/include/llvm/ADT/Bitfields.h index 045704a470b9cc..4064d716f8a774 100644 --- a/llvm/include/llvm/ADT/Bitfields.h +++ b/llvm/include/llvm/ADT/Bitfields.h @@ -195,7 +195,7 @@ template struct Impl { /// API. 
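// For reference, std::underlying_type_t<T> (used below and again in
// Sequence.h, OrcError.cpp, and CodeView.h later in this patch) is simply
// the C++14 alias for the C++11 trait; a self-contained illustration:
//   #include <type_traits>
//   enum class E : unsigned short { A };
//   static_assert(std::is_same<typename std::underlying_type<E>::type,
//                              std::underlying_type_t<E>>::value,
//                 "underlying_type_t<E> names the same type");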
template ::value> struct ResolveUnderlyingType { - using type = typename std::underlying_type::type; + using type = std::underlying_type_t; }; template struct ResolveUnderlyingType { using type = T; diff --git a/llvm/include/llvm/ADT/Sequence.h b/llvm/include/llvm/ADT/Sequence.h index 96935c291ec1e4..88a6fa92059836 100644 --- a/llvm/include/llvm/ADT/Sequence.h +++ b/llvm/include/llvm/ADT/Sequence.h @@ -139,7 +139,7 @@ struct CheckedInt { template ::value, bool> = 0> static CheckedInt from(Enum FromValue) { - using type = typename std::underlying_type::type; + using type = std::underlying_type_t; return from(static_cast(FromValue)); } @@ -175,7 +175,7 @@ struct CheckedInt { template ::value, bool> = 0> Enum to() const { - using type = typename std::underlying_type::type; + using type = std::underlying_type_t; return Enum(to()); } diff --git a/llvm/include/llvm/ADT/SmallVector.h b/llvm/include/llvm/ADT/SmallVector.h index 14f051e383f725..513fce93549fb5 100644 --- a/llvm/include/llvm/ADT/SmallVector.h +++ b/llvm/include/llvm/ADT/SmallVector.h @@ -468,8 +468,7 @@ class SmallVectorTemplateBase : public SmallVectorTemplateCommon { /// Either const T& or T, depending on whether it's cheap enough to take /// parameters by value. - using ValueParamT = - typename std::conditional::type; + using ValueParamT = std::conditional_t; SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} diff --git a/llvm/include/llvm/Analysis/InlineCost.h b/llvm/include/llvm/Analysis/InlineCost.h index a0a5cb8bf356c0..6596cb3375e1bf 100644 --- a/llvm/include/llvm/Analysis/InlineCost.h +++ b/llvm/include/llvm/Analysis/InlineCost.h @@ -95,6 +95,9 @@ class InlineCost { /// The adjusted threshold against which this cost was computed. int Threshold = 0; + /// The amount of StaticBonus that has been applied. + int StaticBonusApplied = 0; + /// Must be set for Always and Never instances. const char *Reason = nullptr; @@ -102,27 +105,29 @@ class InlineCost { Optional CostBenefit = None; // Trivial constructor, interesting logic in the factory functions below. - InlineCost(int Cost, int Threshold, const char *Reason = nullptr, + InlineCost(int Cost, int Threshold, int StaticBonusApplied, + const char *Reason = nullptr, Optional CostBenefit = None) - : Cost(Cost), Threshold(Threshold), Reason(Reason), + : Cost(Cost), Threshold(Threshold), + StaticBonusApplied(StaticBonusApplied), Reason(Reason), CostBenefit(CostBenefit) { assert((isVariable() || Reason) && "Reason must be provided for Never or Always"); } public: - static InlineCost get(int Cost, int Threshold) { + static InlineCost get(int Cost, int Threshold, int StaticBonus = 0) { assert(Cost > AlwaysInlineCost && "Cost crosses sentinel value"); assert(Cost < NeverInlineCost && "Cost crosses sentinel value"); - return InlineCost(Cost, Threshold); + return InlineCost(Cost, Threshold, StaticBonus); } static InlineCost getAlways(const char *Reason, Optional CostBenefit = None) { - return InlineCost(AlwaysInlineCost, 0, Reason, CostBenefit); + return InlineCost(AlwaysInlineCost, 0, 0, Reason, CostBenefit); } static InlineCost getNever(const char *Reason, Optional CostBenefit = None) { - return InlineCost(NeverInlineCost, 0, Reason, CostBenefit); + return InlineCost(NeverInlineCost, 0, 0, Reason, CostBenefit); } /// Test whether the inline cost is low enough for inlining. @@ -145,6 +150,12 @@ class InlineCost { return Threshold; } + /// Get the amount of StaticBonus applied. 
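+  /// When the sole-call-to-local-function bonus has fired, getCost() has
+  /// already been reduced by LastCallToStaticBonus, so a consumer can
+  /// recover the pre-bonus cost as getCost() + getStaticBonusApplied().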
+ int getStaticBonusApplied() const { + assert(isVariable() && "Invalid access of InlineCost"); + return StaticBonusApplied; + } + /// Get the cost-benefit pair which was computed by cost-benefit analysis Optional getCostBenefit() const { return CostBenefit; } diff --git a/llvm/include/llvm/DebugInfo/CodeView/CodeView.h b/llvm/include/llvm/DebugInfo/CodeView/CodeView.h index b7a3e1561a0794..010a82dd0e232c 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/CodeView.h +++ b/llvm/include/llvm/DebugInfo/CodeView/CodeView.h @@ -51,18 +51,15 @@ enum SymbolKind : uint16_t { #define CV_DEFINE_ENUM_CLASS_FLAGS_OPERATORS(Class) \ inline Class operator|(Class a, Class b) { \ - return static_cast( \ - static_cast::type>(a) | \ - static_cast::type>(b)); \ + return static_cast(static_cast>(a) | \ + static_cast>(b)); \ } \ inline Class operator&(Class a, Class b) { \ - return static_cast( \ - static_cast::type>(a) & \ - static_cast::type>(b)); \ + return static_cast(static_cast>(a) & \ + static_cast>(b)); \ } \ inline Class operator~(Class a) { \ - return static_cast( \ - ~static_cast::type>(a)); \ + return static_cast(~static_cast>(a)); \ } \ inline Class &operator|=(Class &a, Class b) { \ a = a | b; \ diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index e193ceea86f767..e17b90844c9a5c 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -549,6 +549,9 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { /// for speculative "expected profit" of the inlining decision. int Threshold = 0; + /// The amount of StaticBonus applied. + int StaticBonusApplied = 0; + /// Attempt to evaluate indirect calls to boost its inline cost. const bool BoostIndirectCalls; @@ -1058,6 +1061,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { virtual ~InlineCostCallAnalyzer() = default; int getThreshold() const { return Threshold; } int getCost() const { return Cost; } + int getStaticBonusApplied() const { return StaticBonusApplied; } Optional getCostBenefitPair() { return CostBenefit; } bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; } bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; } @@ -1922,8 +1926,10 @@ void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) { // If there is only one call of the function, and it has internal linkage, // the cost of inlining it drops dramatically. It may seem odd to update // Cost in updateThreshold, but the bonus depends on the logic in this method. - if (isSoleCallToLocalFunction(Call, F)) + if (isSoleCallToLocalFunction(Call, F)) { Cost -= LastCallToStaticBonus; + StaticBonusApplied = LastCallToStaticBonus; + } } bool CallAnalyzer::visitCmpInst(CmpInst &I) { @@ -2970,7 +2976,8 @@ InlineCost llvm::getInlineCost( } if (CA.wasDecidedByCostThreshold()) - return InlineCost::get(CA.getCost(), CA.getThreshold()); + return InlineCost::get(CA.getCost(), CA.getThreshold(), + CA.getStaticBonusApplied()); // No details on how the decision was made, simply return always or never. 
return ShouldInline.isSuccess() diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/OrcError.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/OrcError.cpp index 2cc2bddeb21a10..ec53338570db27 100644 --- a/llvm/lib/ExecutionEngine/Orc/Shared/OrcError.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Shared/OrcError.cpp @@ -82,7 +82,7 @@ char DuplicateDefinition::ID = 0; char JITSymbolNotFound::ID = 0; std::error_code orcError(OrcErrorCode ErrCode) { - typedef std::underlying_type::type UT; + typedef std::underlying_type_t UT; return std::error_code(static_cast(ErrCode), getOrcErrCat()); } @@ -105,7 +105,7 @@ JITSymbolNotFound::JITSymbolNotFound(std::string SymbolName) : SymbolName(std::move(SymbolName)) {} std::error_code JITSymbolNotFound::convertToErrorCode() const { - typedef std::underlying_type::type UT; + typedef std::underlying_type_t UT; return std::error_code(static_cast(OrcErrorCode::JITSymbolNotFound), getOrcErrCat()); } diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index e37e1d58983040..68d33669060cfd 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -668,6 +668,7 @@ def TuneA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [ FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureBalanceFPOps, FeatureCustomCheapAsMoveHandling, FeaturePostRAScheduler]>; @@ -675,12 +676,14 @@ def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", def TuneA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55", "Cortex-A55 ARM processors", [ FeatureFuseAES, + FeatureFuseAdrpAdd, FeaturePostRAScheduler, FeatureFuseAddress]>; def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510", "Cortex-A510 ARM processors", [ FeatureFuseAES, + FeatureFuseAdrpAdd, FeaturePostRAScheduler ]>; @@ -709,27 +712,32 @@ def TuneA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", def TuneA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", "Cortex-A73 ARM processors", [ - FeatureFuseAES]>; + FeatureFuseAES, + FeatureFuseAdrpAdd]>; def TuneA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75", "Cortex-A75 ARM processors", [ - FeatureFuseAES]>; + FeatureFuseAES, + FeatureFuseAdrpAdd]>; def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76", "Cortex-A76 ARM processors", [ FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureLSLFast]>; def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77", "Cortex-A77 ARM processors", [ FeatureCmpBccFusion, FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureLSLFast]>; def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78", "Cortex-A78 ARM processors", [ FeatureCmpBccFusion, FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler]>; @@ -738,6 +746,7 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily", "Cortex-A78C ARM processors", [ FeatureCmpBccFusion, FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler]>; @@ -745,6 +754,7 @@ def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710", "Cortex-A710 ARM processors", [ FeatureCmpBccFusion, FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler]>; @@ -757,6 +767,7 @@ def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1", "Cortex-X1 ARM processors", [ FeatureCmpBccFusion, FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler]>; @@ -764,6 +775,7 @@ def TuneX2 : SubtargetFeature<"cortex-x2", 
"ARMProcFamily", "CortexX2", "Cortex-X2 ARM processors", [ FeatureCmpBccFusion, FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler]>; @@ -941,6 +953,7 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor", def TuneNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily", "NeoverseE1", "Neoverse E1 ARM processors", [ FeatureFuseAES, + FeatureFuseAdrpAdd, FeaturePostRAScheduler]>; def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1", @@ -953,18 +966,21 @@ def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1 def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2", "Neoverse N2 ARM processors", [ FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler]>; def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Neoverse512TVB", "Neoverse 512-TVB ARM processors", [ FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler]>; def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1", "Neoverse V1 ARM processors", [ FeatureFuseAES, + FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler]>; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 0cc97e3b9e0f19..544fd0efd05bcf 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21191,6 +21191,21 @@ bool ARMTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { return Subtarget->hasV6T2Ops(); } +bool ARMTargetLowering::isMaskAndCmp0FoldingBeneficial( + const Instruction &AndI) const { + if (!Subtarget->hasV7Ops()) + return false; + + // Sink the `and` instruction only if the mask would fit into a modified + // immediate operand. + ConstantInt *Mask = dyn_cast(AndI.getOperand(1)); + if (!Mask || Mask->getValue().getBitWidth() > 32u) + return false; + auto MaskVal = unsigned(Mask->getValue().getZExtValue()); + return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal) + : ARM_AM::getSOImmVal(MaskVal)) != -1; +} + bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const { return !Subtarget->hasMinSize() || Subtarget->isTargetWindows(); } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 9ff920f230e22c..1403e4c8c0a114 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -584,6 +584,8 @@ class VectorType; bool preferZeroCompareBranch() const override { return true; } + bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; + bool isShuffleMaskLegal(ArrayRef M, EVT VT) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index d9993e0a616f76..9a901593c523c7 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -157,11 +157,8 @@ def : Pat<(fcanonicalize FPR32:$fj), (FMAX_S $fj, $fj)>; // Match non-signaling comparison -// TODO: change setcc to any_fsetcc after call is supported because -// we need to call llvm.experimental.constrained.fcmp.f32 in testcase. -// See RISCV float-fcmp-strict.ll for reference. 
class PatFPSetcc - : Pat<(setcc RegTy:$fj, RegTy:$fk, cc), + : Pat<(any_fsetcc RegTy:$fj, RegTy:$fk, cc), (MOVCF2GR (CmpInst RegTy:$fj, RegTy:$fk))>; // SETOGT/SETOGE/SETUGT/SETUGE will expand into SETOLT/SETOLE/SETULT/SETULE. def : PatFPSetcc; @@ -196,7 +193,22 @@ defm : PatFPBrcond; defm : PatFPBrcond; defm : PatFPBrcond; -// TODO: Match signaling comparison strict_fsetccs with FCMP_S*_S instructions. +// Match signaling comparison + +class PatStrictFsetccs + : Pat<(strict_fsetccs RegTy:$fj, RegTy:$fk, cc), + (MOVCF2GR (CmpInst RegTy:$fj, RegTy:$fk))>; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; /// Select diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td index 42a02514ffacb2..9fb9b99d32f3a3 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td @@ -168,10 +168,6 @@ def : Pat<(fcanonicalize FPR64:$fj), (FMAX_D $fj, $fj)>; // Match non-signaling comparison -// TODO: Change setcc to any_fsetcc after call is supported because -// we need to call llvm.experimental.constrained.fcmp.f64 in testcase. -// See RISCV float-fcmp-strict.ll for reference. - // SETOGT/SETOGE/SETUGT/SETUGE will expand into SETOLT/SETOLE/SETULT/SETULE. def : PatFPSetcc; def : PatFPSetcc; @@ -197,7 +193,19 @@ defm : PatFPBrcond; defm : PatFPBrcond; defm : PatFPBrcond; -// TODO: Match signaling comparison strict_fsetccs with FCMP_S*_D instructions. +// Match signaling comparison + +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; +def : PatStrictFsetccs; /// Select diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 920a9da58b859a..c2b1b443444e0c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -118,11 +118,15 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f32, Legal); setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal); setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Legal); } if (Subtarget.hasBasicD()) { setCondCodeAction(FPCCToExpand, MVT::f64, Expand); setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); setOperationAction(ISD::BR_CC, MVT::f64, Expand); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Legal); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); setOperationAction(ISD::FMA, MVT::f64, Legal); diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index 8c2763cb7aff16..2c87492b8242f8 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -201,6 +201,10 @@ void VETargetLowering::initSPUActions() { setOperationAction(ISD::AND, IntVT, Act); setOperationAction(ISD::OR, IntVT, Act); 
setOperationAction(ISD::XOR, IntVT, Act); + + // Legal smax and smin + setOperationAction(ISD::SMAX, IntVT, Legal); + setOperationAction(ISD::SMIN, IntVT, Legal); } /// } Int Ops diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td index 528bebd4276a05..d8eb65185a7024 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.td +++ b/llvm/lib/Target/VE/VEInstrInfo.td @@ -1237,14 +1237,14 @@ defm CMPSL : RRNCm<"cmps.l", 0x6A, I64, i64>; // Section 8.4.17 - CMS (Compare and Select Maximum/Minimum Single) // cx: sx/zx, cw: max/min -defm MAXSWSX : RRm<"maxs.w.sx", 0x78, I32, i32>; +defm MAXSWSX : RRm<"maxs.w.sx", 0x78, I32, i32, smax>; let cx = 1 in defm MAXSWZX : RRm<"maxs.w.zx", 0x78, I32, i32>; -let cw = 1 in defm MINSWSX : RRm<"mins.w.sx", 0x78, I32, i32>; +let cw = 1 in defm MINSWSX : RRm<"mins.w.sx", 0x78, I32, i32, smin>; let cx = 1, cw = 1 in defm MINSWZX : RRm<"mins.w.zx", 0x78, I32, i32>; // Section 8.4.18 - CMX (Compare and Select Maximum/Minimum) -defm MAXSL : RRm<"maxs.l", 0x68, I64, i64>; -let cw = 1 in defm MINSL : RRm<"mins.l", 0x68, I64, i64>; +defm MAXSL : RRm<"maxs.l", 0x68, I64, i64, smax>; +let cw = 1 in defm MINSL : RRm<"mins.l", 0x68, I64, i64, smin>; } // isReMaterializable, isAsCheapAsAMove @@ -2052,45 +2052,6 @@ def : Pat<(i32 (setcc f64:$l, f64:$r, cond:$cond)), def : Pat<(i32 (setcc f128:$l, f128:$r, cond:$cond)), (setccrr (fcond2cc $cond), (FCMPQrr $l, $r))>; -// Special SELECTCC pattern matches -// Use min/max for better performance. -// -// MAX/MIN %res, %lhs, %rhs - -def : Pat<(f64 (selectcc f64:$LHS, f64:$RHS, f64:$LHS, f64:$RHS, SETOGT)), - (FMAXDrr $LHS, $RHS)>; -def : Pat<(f32 (selectcc f32:$LHS, f32:$RHS, f32:$LHS, f32:$RHS, SETOGT)), - (FMAXSrr $LHS, $RHS)>; -def : Pat<(i64 (selectcc i64:$LHS, i64:$RHS, i64:$LHS, i64:$RHS, SETGT)), - (MAXSLrr $LHS, $RHS)>; -def : Pat<(i32 (selectcc i32:$LHS, i32:$RHS, i32:$LHS, i32:$RHS, SETGT)), - (MAXSWSXrr $LHS, $RHS)>; -def : Pat<(f64 (selectcc f64:$LHS, f64:$RHS, f64:$LHS, f64:$RHS, SETOGE)), - (FMAXDrr $LHS, $RHS)>; -def : Pat<(f32 (selectcc f32:$LHS, f32:$RHS, f32:$LHS, f32:$RHS, SETOGE)), - (FMAXSrr $LHS, $RHS)>; -def : Pat<(i64 (selectcc i64:$LHS, i64:$RHS, i64:$LHS, i64:$RHS, SETGE)), - (MAXSLrr $LHS, $RHS)>; -def : Pat<(i32 (selectcc i32:$LHS, i32:$RHS, i32:$LHS, i32:$RHS, SETGE)), - (MAXSWSXrr $LHS, $RHS)>; - -def : Pat<(f64 (selectcc f64:$LHS, f64:$RHS, f64:$LHS, f64:$RHS, SETOLT)), - (FMINDrr $LHS, $RHS)>; -def : Pat<(f32 (selectcc f32:$LHS, f32:$RHS, f32:$LHS, f32:$RHS, SETOLT)), - (FMINSrr $LHS, $RHS)>; -def : Pat<(i64 (selectcc i64:$LHS, i64:$RHS, i64:$LHS, i64:$RHS, SETLT)), - (MINSLrr $LHS, $RHS)>; -def : Pat<(i32 (selectcc i32:$LHS, i32:$RHS, i32:$LHS, i32:$RHS, SETLT)), - (MINSWSXrr $LHS, $RHS)>; -def : Pat<(f64 (selectcc f64:$LHS, f64:$RHS, f64:$LHS, f64:$RHS, SETOLE)), - (FMINDrr $LHS, $RHS)>; -def : Pat<(f32 (selectcc f32:$LHS, f32:$RHS, f32:$LHS, f32:$RHS, SETOLE)), - (FMINSrr $LHS, $RHS)>; -def : Pat<(i64 (selectcc i64:$LHS, i64:$RHS, i64:$LHS, i64:$RHS, SETLE)), - (MINSLrr $LHS, $RHS)>; -def : Pat<(i32 (selectcc i32:$LHS, i32:$RHS, i32:$LHS, i32:$RHS, SETLE)), - (MINSWSXrr $LHS, $RHS)>; - // Helper classes to construct cmov patterns for the ease. // // Hiding INSERT_SUBREG/EXTRACT_SUBREG patterns. 
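// Note on the integer selectcc patterns deleted above: with SMAX/SMIN now
// legal in VEISelLowering.cpp, select-of-compare shapes reach instruction
// selection as smax/smin nodes and are picked up by the RRm patterns on
// MAXS*/MINS* directly, so the hand-written select_cc forms are redundant.
// A sketch of the source-level effect (assuming the standard mid-end
// canonicalization of select-of-compare to min/max):
//   long max(long a, long b) { return a > b ? a : b; } // selects maxs.l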
diff --git a/llvm/test/Analysis/MustExecute/pr57780.ll b/llvm/test/Analysis/MustExecute/pr57780.ll new file mode 100644 index 00000000000000..a7b47a1fb8b5f7 --- /dev/null +++ b/llvm/test/Analysis/MustExecute/pr57780.ll @@ -0,0 +1,55 @@ +; RUN: opt -disable-output -print-mustexecute < %s 2>&1 | FileCheck %s + +@c = global i16 0, align 2 + +; FIXME: miscompile +; CHECK-LABEL: define void @latch_cycle_irreducible +; CHECK: store i16 5, ptr @c, align 2 ; (mustexec in: loop) +define void @latch_cycle_irreducible() { +entry: + br label %loop + +loop: ; preds = %loop.latch, %entry + %v = phi i32 [ 10, %entry ], [ 0, %loop.latch ] + %c = icmp eq i32 %v, 0 + br i1 %c, label %loop.exit, label %loop.cont + +loop.cont: ; preds = %loop + br i1 false, label %loop.irreducible, label %loop.latch + +loop.irreducible: ; preds = %loop.latch, %loop.cont + store i16 5, ptr @c, align 2 + br label %loop.latch + +loop.latch: ; preds = %loop.irreducible, %loop.cont + br i1 false, label %loop.irreducible, label %loop + +loop.exit: ; preds = %loop + ret void +} + +; FIXME: miscompile +; CHECK-LABEL: define void @latch_cycle_reducible +; CHECK: store i16 5, ptr @c, align 2 ; (mustexec in: loop) +define void @latch_cycle_reducible() { +entry: + br label %loop + +loop: ; preds = %loop.latch, %entry + %v = phi i32 [ 10, %entry ], [ 0, %loop.latch ] + %c = icmp eq i32 %v, 0 + br i1 %c, label %loop.exit, label %loop2 + +loop2: ; preds = %loop.latch, %loop + br i1 false, label %loop2.cont, label %loop.latch + +loop2.cont: ; preds = %loop2 + store i16 5, ptr @c, align 2 + br label %loop.latch + +loop.latch: ; preds = %loop2.cont, %loop2 + br i1 false, label %loop2, label %loop + +loop.exit: ; preds = %loop + ret void +} diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll new file mode 100644 index 00000000000000..ed53c77e4c88c5 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll @@ -0,0 +1,37 @@ +; RUN: llc %s -o - -mtriple=aarch64-unknown -mattr=+fuse-adrp-add | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=generic | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a53 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a55 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a510 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a73 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a75 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a76 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a77 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a78 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a710 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n2 | FileCheck %s + +@g = common local_unnamed_addr global i8* null, align 8 + +define dso_local i8* @addldr(i32 %a, i32 %b) { +; CHECK-LABEL: addldr: +; CHECK: adrp [[R:x[0-9]+]], addldr +; CHECK-NEXT: add {{x[0-9]+}}, [[R]], :lo12:addldr +entry: + %add = add nsw i32 %b, %a + %idx.ext = sext i32 %add to i64 + %add.ptr = getelementptr i8, i8* bitcast (i8* (i32, i32)* @addldr to i8*), i64 %idx.ext + store i8* %add.ptr, i8** @g, align 8 + ret i8* %add.ptr +} + + +define double @litf() { +; CHECK-LABEL: 
litf: +; CHECK: adrp [[ADDR:x[0-9]+]], [[CSTLABEL:.LCP.*]] +; CHECK-NEXT: ldr {{d[0-9]+}}, {{[[]}}[[ADDR]], :lo12:[[CSTLABEL]]{{[]]}} +entry: + ret double 0x400921FB54442D18 +} diff --git a/llvm/test/CodeGen/ARM/and-cmp0-sink.ll b/llvm/test/CodeGen/ARM/and-cmp0-sink.ll new file mode 100644 index 00000000000000..27203e274a4aa6 --- /dev/null +++ b/llvm/test/CodeGen/ARM/and-cmp0-sink.ll @@ -0,0 +1,396 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s --check-prefix V7M +; RUN: llc -mtriple=armv7a-eabi %s -o - | FileCheck %s --check-prefix V7A +; RUN: llc -mtriple=thumbv7a-eabi %s -o - | FileCheck %s --check-prefix V7A-T +; RUN: llc -mtriple=armv6m-eabi %s -o - | FileCheck %s --check-prefix V6M + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + +; Test sink of `and` instructions to fold in to `tst`, `lsls`, do cmov-bfi combine, etc. +define void @f(i32 %v, ptr noalias %outp) { +; V7M-LABEL: f: +; V7M: @ %bb.0: @ %entry +; V7M-NEXT: movs r2, #0 +; V7M-NEXT: str r2, [r1] +; V7M-NEXT: lsls r2, r0, #31 +; V7M-NEXT: bne .LBB0_3 +; V7M-NEXT: @ %bb.1: @ %if.then +; V7M-NEXT: tst.w r0, #14 +; V7M-NEXT: beq .LBB0_6 +; V7M-NEXT: @ %bb.2: +; V7M-NEXT: lsls r2, r0, #30 +; V7M-NEXT: mov.w r3, #33024 +; V7M-NEXT: and.w r2, r3, r2, asr #31 +; V7M-NEXT: lsrs r0, r0, #2 +; V7M-NEXT: bfi r2, r0, #7, #1 +; V7M-NEXT: bfi r2, r0, #14, #1 +; V7M-NEXT: b .LBB0_5 +; V7M-NEXT: .LBB0_3: @ %if.else +; V7M-NEXT: tst.w r0, #14 +; V7M-NEXT: it eq +; V7M-NEXT: bxeq lr +; V7M-NEXT: .LBB0_4: +; V7M-NEXT: lsls r2, r0, #30 +; V7M-NEXT: mov.w r3, #8256 +; V7M-NEXT: and.w r2, r3, r2, asr #31 +; V7M-NEXT: lsrs r0, r0, #2 +; V7M-NEXT: bfi r2, r0, #5, #1 +; V7M-NEXT: bfi r2, r0, #12, #1 +; V7M-NEXT: .LBB0_5: @ %if.end +; V7M-NEXT: str r2, [r1] +; V7M-NEXT: .LBB0_6: @ %exit +; V7M-NEXT: bx lr +; +; V7A-LABEL: f: +; V7A: @ %bb.0: @ %entry +; V7A-NEXT: mov r2, #0 +; V7A-NEXT: tst r0, #1 +; V7A-NEXT: str r2, [r1] +; V7A-NEXT: bne .LBB0_3 +; V7A-NEXT: @ %bb.1: @ %if.then +; V7A-NEXT: tst r0, #14 +; V7A-NEXT: beq .LBB0_6 +; V7A-NEXT: @ %bb.2: +; V7A-NEXT: lsl r2, r0, #30 +; V7A-NEXT: mov r3, #33024 +; V7A-NEXT: and r2, r3, r2, asr #31 +; V7A-NEXT: lsr r0, r0, #2 +; V7A-NEXT: bfi r2, r0, #7, #1 +; V7A-NEXT: bfi r2, r0, #14, #1 +; V7A-NEXT: b .LBB0_5 +; V7A-NEXT: .LBB0_3: @ %if.else +; V7A-NEXT: tst r0, #14 +; V7A-NEXT: bxeq lr +; V7A-NEXT: .LBB0_4: +; V7A-NEXT: lsl r2, r0, #30 +; V7A-NEXT: mov r3, #8256 +; V7A-NEXT: and r2, r3, r2, asr #31 +; V7A-NEXT: lsr r0, r0, #2 +; V7A-NEXT: bfi r2, r0, #5, #1 +; V7A-NEXT: bfi r2, r0, #12, #1 +; V7A-NEXT: .LBB0_5: @ %if.end +; V7A-NEXT: str r2, [r1] +; V7A-NEXT: .LBB0_6: @ %exit +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: f: +; V7A-T: @ %bb.0: @ %entry +; V7A-T-NEXT: movs r2, #0 +; V7A-T-NEXT: str r2, [r1] +; V7A-T-NEXT: lsls r2, r0, #31 +; V7A-T-NEXT: bne .LBB0_3 +; V7A-T-NEXT: @ %bb.1: @ %if.then +; V7A-T-NEXT: tst.w r0, #14 +; V7A-T-NEXT: beq .LBB0_6 +; V7A-T-NEXT: @ %bb.2: +; V7A-T-NEXT: lsls r2, r0, #30 +; V7A-T-NEXT: mov.w r3, #33024 +; V7A-T-NEXT: and.w r2, r3, r2, asr #31 +; V7A-T-NEXT: lsrs r0, r0, #2 +; V7A-T-NEXT: bfi r2, r0, #7, #1 +; V7A-T-NEXT: bfi r2, r0, #14, #1 +; V7A-T-NEXT: b .LBB0_5 +; V7A-T-NEXT: .LBB0_3: @ %if.else +; V7A-T-NEXT: tst.w r0, #14 +; V7A-T-NEXT: it eq +; V7A-T-NEXT: bxeq lr +; V7A-T-NEXT: .LBB0_4: +; V7A-T-NEXT: lsls r2, r0, #30 +; V7A-T-NEXT: mov.w r3, #8256 +; V7A-T-NEXT: and.w r2, r3, r2, asr #31 +; V7A-T-NEXT: lsrs r0, r0, #2 +; V7A-T-NEXT: bfi r2, r0, 
#5, #1 +; V7A-T-NEXT: bfi r2, r0, #12, #1 +; V7A-T-NEXT: .LBB0_5: @ %if.end +; V7A-T-NEXT: str r2, [r1] +; V7A-T-NEXT: .LBB0_6: @ %exit +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: f: +; V6M: @ %bb.0: @ %entry +; V6M-NEXT: .save {r4, lr} +; V6M-NEXT: push {r4, lr} +; V6M-NEXT: movs r2, #0 +; V6M-NEXT: str r2, [r1] +; V6M-NEXT: movs r3, #14 +; V6M-NEXT: ands r3, r0 +; V6M-NEXT: movs r4, #4 +; V6M-NEXT: ands r4, r0 +; V6M-NEXT: movs r2, #2 +; V6M-NEXT: ands r2, r0 +; V6M-NEXT: lsls r0, r0, #31 +; V6M-NEXT: bne .LBB0_5 +; V6M-NEXT: @ %bb.1: @ %if.then +; V6M-NEXT: movs r0, #129 +; V6M-NEXT: cmp r2, #0 +; V6M-NEXT: beq .LBB0_3 +; V6M-NEXT: @ %bb.2: +; V6M-NEXT: lsls r2, r0, #8 +; V6M-NEXT: .LBB0_3: @ %if.then +; V6M-NEXT: cmp r4, #0 +; V6M-NEXT: beq .LBB0_10 +; V6M-NEXT: @ %bb.4: @ %if.then +; V6M-NEXT: lsls r0, r0, #7 +; V6M-NEXT: b .LBB0_9 +; V6M-NEXT: .LBB0_5: @ %if.else +; V6M-NEXT: movs r0, #129 +; V6M-NEXT: cmp r2, #0 +; V6M-NEXT: beq .LBB0_7 +; V6M-NEXT: @ %bb.6: +; V6M-NEXT: lsls r2, r0, #6 +; V6M-NEXT: .LBB0_7: @ %if.else +; V6M-NEXT: cmp r4, #0 +; V6M-NEXT: beq .LBB0_10 +; V6M-NEXT: @ %bb.8: @ %if.else +; V6M-NEXT: lsls r0, r0, #5 +; V6M-NEXT: .LBB0_9: @ %if.else +; V6M-NEXT: adds r2, r2, r0 +; V6M-NEXT: .LBB0_10: @ %if.else +; V6M-NEXT: cmp r3, #0 +; V6M-NEXT: beq .LBB0_12 +; V6M-NEXT: @ %bb.11: @ %if.end +; V6M-NEXT: str r2, [r1] +; V6M-NEXT: .LBB0_12: @ %exit +; V6M-NEXT: pop {r4, pc} +entry: + store i32 0, ptr %outp, align 4 + %and = and i32 %v, 1 + %cmp = icmp eq i32 %and, 0 + %and1 = and i32 %v, 2 + %tobool.not = icmp eq i32 %and1, 0 + %and2 = and i32 %v, 4 + %tobool1.not = icmp eq i32 %and2, 0 + %and3 = and i32 %v, 14 + %tobool2.not = icmp eq i32 %and3, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %select = select i1 %tobool.not, i32 0, i32 33024 + %or = or i32 %select, 16512 + %spec.select = select i1 %tobool1.not, i32 %select, i32 %or + br i1 %tobool2.not, label %exit, label %if.end + +if.else: + %select1 = select i1 %tobool.not, i32 0, i32 8256 + %or1 = or i32 %select1, 4128 + %spec.select1 = select i1 %tobool1.not, i32 %select1, i32 %or1 + br i1 %tobool2.not, label %exit, label %if.end + +if.end: + %spec.select.sink = phi i32 [ %spec.select, %if.then ], [ %spec.select1, %if.else ] + store i32 %spec.select.sink, ptr %outp, align 4 + br label %exit + +exit: + ret void +} + +; Test with a mask that can be encoded with T32 instruction set, but not with A32. 
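+; (16843009 is 0x01010101: the T32 modified-immediate encoding includes the
+; byte-replicated 0xXYXYXYXY form, so `tst.w r1, #16843009` is encodable,
+; whereas an A32 modified immediate is an 8-bit value rotated right by an
+; even amount and cannot produce 0x01010101; the A32 path therefore keeps
+; the mask in a register, as the V7A checks below show.)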
+define i32 @f0(i1 %c0, i32 %v) { +; V7M-LABEL: f0: +; V7M: @ %bb.0: @ %E +; V7M-NEXT: lsls r0, r0, #31 +; V7M-NEXT: beq .LBB1_2 +; V7M-NEXT: @ %bb.1: @ %A +; V7M-NEXT: tst.w r1, #16843009 +; V7M-NEXT: itt eq +; V7M-NEXT: moveq r0, #0 +; V7M-NEXT: bxeq lr +; V7M-NEXT: b .LBB1_3 +; V7M-NEXT: .LBB1_2: @ %B +; V7M-NEXT: tst.w r1, #16843009 +; V7M-NEXT: itt ne +; V7M-NEXT: movne r0, #0 +; V7M-NEXT: bxne lr +; V7M-NEXT: .LBB1_3: @ %D +; V7M-NEXT: movs r0, #1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: f0: +; V7A: @ %bb.0: @ %E +; V7A-NEXT: movw r2, #257 +; V7A-NEXT: tst r0, #1 +; V7A-NEXT: movt r2, #257 +; V7A-NEXT: and r1, r1, r2 +; V7A-NEXT: beq .LBB1_3 +; V7A-NEXT: @ %bb.1: @ %A +; V7A-NEXT: cmp r1, #0 +; V7A-NEXT: moveq r0, #0 +; V7A-NEXT: bxeq lr +; V7A-NEXT: .LBB1_2: @ %D +; V7A-NEXT: mov r0, #1 +; V7A-NEXT: bx lr +; V7A-NEXT: .LBB1_3: @ %B +; V7A-NEXT: mov r0, #0 +; V7A-NEXT: cmp r1, #0 +; V7A-NEXT: moveq r0, #1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: f0: +; V7A-T: @ %bb.0: @ %E +; V7A-T-NEXT: lsls r0, r0, #31 +; V7A-T-NEXT: beq .LBB1_2 +; V7A-T-NEXT: @ %bb.1: @ %A +; V7A-T-NEXT: tst.w r1, #16843009 +; V7A-T-NEXT: itt eq +; V7A-T-NEXT: moveq r0, #0 +; V7A-T-NEXT: bxeq lr +; V7A-T-NEXT: b .LBB1_3 +; V7A-T-NEXT: .LBB1_2: @ %B +; V7A-T-NEXT: tst.w r1, #16843009 +; V7A-T-NEXT: itt ne +; V7A-T-NEXT: movne r0, #0 +; V7A-T-NEXT: bxne lr +; V7A-T-NEXT: .LBB1_3: @ %D +; V7A-T-NEXT: movs r0, #1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: f0: +; V6M: @ %bb.0: @ %E +; V6M-NEXT: ldr r2, .LCPI1_0 +; V6M-NEXT: ands r2, r1 +; V6M-NEXT: lsls r0, r0, #31 +; V6M-NEXT: beq .LBB1_3 +; V6M-NEXT: @ %bb.1: @ %A +; V6M-NEXT: cmp r2, #0 +; V6M-NEXT: bne .LBB1_5 +; V6M-NEXT: @ %bb.2: +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: bx lr +; V6M-NEXT: .LBB1_3: @ %B +; V6M-NEXT: cmp r2, #0 +; V6M-NEXT: beq .LBB1_5 +; V6M-NEXT: @ %bb.4: +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: bx lr +; V6M-NEXT: .LBB1_5: @ %D +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: bx lr +; V6M-NEXT: .p2align 2 +; V6M-NEXT: @ %bb.6: +; V6M-NEXT: .LCPI1_0: +; V6M-NEXT: .long 16843009 @ 0x1010101 +E: + %a = and i32 %v, 16843009 + br i1 %c0, label %A, label %B + +A: + %c1 = icmp eq i32 %a, 0 + br i1 %c1, label %C, label %D + +B: + %c2 = icmp eq i32 %a, 0 + br i1 %c2, label %D, label %C + +C: + br label %X + +D: + br label %X + +X: + %x = phi i32 [0, %C], [1, %D] + ret i32 %x +} + +; Test with a mask that can be encoded both with T32 and A32 instruction sets. 
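+; (100663296 is 0x06000000, i.e. 0x06 rotated right by 8 bits, which both
+; the A32 and the T32 modified-immediate encodings can represent, so both
+; paths fold the mask directly into `tst`.)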
+define i32 @f1(i1 %c0, i32 %v) { +; V7M-LABEL: f1: +; V7M: @ %bb.0: @ %E +; V7M-NEXT: lsls r0, r0, #31 +; V7M-NEXT: beq .LBB2_2 +; V7M-NEXT: @ %bb.1: @ %A +; V7M-NEXT: tst.w r1, #100663296 +; V7M-NEXT: itt eq +; V7M-NEXT: moveq r0, #0 +; V7M-NEXT: bxeq lr +; V7M-NEXT: b .LBB2_3 +; V7M-NEXT: .LBB2_2: @ %B +; V7M-NEXT: tst.w r1, #100663296 +; V7M-NEXT: itt ne +; V7M-NEXT: movne r0, #0 +; V7M-NEXT: bxne lr +; V7M-NEXT: .LBB2_3: @ %D +; V7M-NEXT: movs r0, #1 +; V7M-NEXT: bx lr +; +; V7A-LABEL: f1: +; V7A: @ %bb.0: @ %E +; V7A-NEXT: tst r0, #1 +; V7A-NEXT: beq .LBB2_3 +; V7A-NEXT: @ %bb.1: @ %A +; V7A-NEXT: tst r1, #100663296 +; V7A-NEXT: moveq r0, #0 +; V7A-NEXT: bxeq lr +; V7A-NEXT: .LBB2_2: @ %D +; V7A-NEXT: mov r0, #1 +; V7A-NEXT: bx lr +; V7A-NEXT: .LBB2_3: @ %B +; V7A-NEXT: mov r0, #0 +; V7A-NEXT: tst r1, #100663296 +; V7A-NEXT: moveq r0, #1 +; V7A-NEXT: bx lr +; +; V7A-T-LABEL: f1: +; V7A-T: @ %bb.0: @ %E +; V7A-T-NEXT: lsls r0, r0, #31 +; V7A-T-NEXT: beq .LBB2_2 +; V7A-T-NEXT: @ %bb.1: @ %A +; V7A-T-NEXT: tst.w r1, #100663296 +; V7A-T-NEXT: itt eq +; V7A-T-NEXT: moveq r0, #0 +; V7A-T-NEXT: bxeq lr +; V7A-T-NEXT: b .LBB2_3 +; V7A-T-NEXT: .LBB2_2: @ %B +; V7A-T-NEXT: tst.w r1, #100663296 +; V7A-T-NEXT: itt ne +; V7A-T-NEXT: movne r0, #0 +; V7A-T-NEXT: bxne lr +; V7A-T-NEXT: .LBB2_3: @ %D +; V7A-T-NEXT: movs r0, #1 +; V7A-T-NEXT: bx lr +; +; V6M-LABEL: f1: +; V6M: @ %bb.0: @ %E +; V6M-NEXT: movs r2, #3 +; V6M-NEXT: lsls r2, r2, #25 +; V6M-NEXT: ands r2, r1 +; V6M-NEXT: lsls r0, r0, #31 +; V6M-NEXT: beq .LBB2_3 +; V6M-NEXT: @ %bb.1: @ %A +; V6M-NEXT: cmp r2, #0 +; V6M-NEXT: bne .LBB2_5 +; V6M-NEXT: @ %bb.2: +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: bx lr +; V6M-NEXT: .LBB2_3: @ %B +; V6M-NEXT: cmp r2, #0 +; V6M-NEXT: beq .LBB2_5 +; V6M-NEXT: @ %bb.4: +; V6M-NEXT: movs r0, #0 +; V6M-NEXT: bx lr +; V6M-NEXT: .LBB2_5: @ %D +; V6M-NEXT: movs r0, #1 +; V6M-NEXT: bx lr +E: + %a = and i32 %v, 100663296 + br i1 %c0, label %A, label %B + +A: + %c1 = icmp eq i32 %a, 0 + br i1 %c1, label %C, label %D + +B: + %c2 = icmp eq i32 %a, 0 + br i1 %c2, label %D, label %C + +C: + br label %X + +D: + br label %X + +X: + %x = phi i32 [0, %C], [1, %D] + ret i32 %x +} diff --git a/llvm/test/CodeGen/LoongArch/double-fcmp-strict.ll b/llvm/test/CodeGen/LoongArch/double-fcmp-strict.ll new file mode 100644 index 00000000000000..066f60752e2a75 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/double-fcmp-strict.ll @@ -0,0 +1,243 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+d < %s | FileCheck %s --check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+d < %s | FileCheck %s --check-prefix=LA64 + +declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata) + +define i32 @fcmp_oeq(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_oeq: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.ceq.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_oeq: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.ceq.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"oeq", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ogt(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_ogt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.clt.d $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ogt: +; LA64: # %bb.0: +; 
LA64-NEXT: fcmp.clt.d $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"ogt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_oge(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_oge: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cle.d $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_oge: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cle.d $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"oge", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_olt(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_olt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.clt.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_olt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.clt.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"olt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ole(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_ole: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cle.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ole: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cle.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"ole", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_one(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_one: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cne.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_one: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cne.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"one", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ord(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_ord: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cor.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ord: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cor.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"ord", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ueq(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_ueq: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cueq.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ueq: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cueq.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"ueq", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ugt(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_ugt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cult.d $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr 
$a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ugt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cult.d $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"ugt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_uge(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_uge: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cule.d $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_uge: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cule.d $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"uge", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ult(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_ult: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cult.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ult: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cult.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"ult", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ule(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_ule: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cule.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ule: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cule.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"ule", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_une(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_une: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cune.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_une: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cune.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"une", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_uno(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_uno: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cun.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_uno: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cun.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"uno", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} diff --git a/llvm/test/CodeGen/LoongArch/double-fcmps-strict.ll b/llvm/test/CodeGen/LoongArch/double-fcmps-strict.ll new file mode 100644 index 00000000000000..c8974fb946222a --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/double-fcmps-strict.ll @@ -0,0 +1,482 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+d < %s | FileCheck %s --check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+d < %s | FileCheck %s --check-prefix=LA64 + +declare i1 @llvm.experimental.constrained.fcmps.f64(double, double, metadata, metadata) +declare 
i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata) + +define i32 @fcmps_oeq(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmps_oeq: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.seq.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_oeq: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.seq.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"oeq", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_ogt(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmps_ogt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.slt.d $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_ogt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.slt.d $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"ogt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_oge(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmps_oge: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sle.d $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_oge: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sle.d $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"oge", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_olt(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmps_olt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.slt.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_olt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.slt.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"olt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_ole(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmps_ole: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sle.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_ole: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sle.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"ole", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_one(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmps_one: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sne.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_one: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sne.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"one", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_ord(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmps_ord: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sor.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_ord: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sor.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: 
ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"ord", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_ueq(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmps_ueq: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sueq.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_ueq: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sueq.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"ueq", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_ugt(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmps_ugt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sult.d $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_ugt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sult.d $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"ugt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_uge(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmps_uge: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sule.d $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_uge: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sule.d $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"uge", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_ult(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmps_ult: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sult.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_ult: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sult.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"ult", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_ule(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmps_ule: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sule.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_ule: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sule.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"ule", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_une(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmps_une: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sune.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_une: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sune.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"une", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_uno(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmps_uno: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sun.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: 
fcmps_uno: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sun.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"uno", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_oeq(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_oeq: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.ceq.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_oeq: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.ceq.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"oeq", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ogt(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_ogt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.clt.d $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ogt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.clt.d $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"ogt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_oge(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_oge: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cle.d $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_oge: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cle.d $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"oge", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_olt(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_olt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.clt.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_olt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.clt.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"olt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ole(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_ole: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cle.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ole: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cle.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"ole", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_one(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_one: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cne.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_one: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cne.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"one", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ord(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_ord: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cor.d $fcc0, $fa0, 
$fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ord: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cor.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"ord", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ueq(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_ueq: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cueq.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ueq: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cueq.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"ueq", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ugt(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_ugt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cult.d $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ugt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cult.d $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"ugt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_uge(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_uge: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cule.d $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_uge: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cule.d $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"uge", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ult(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_ult: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cult.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ult: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cult.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"ult", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ule(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_ule: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cule.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ule: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cule.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"ule", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_une(double %a, double %b) nounwind strictfp { +; LA32-LABEL: fcmp_une: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cune.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_une: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cune.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"une", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_uno(double %a, double %b) nounwind strictfp 
{ +; LA32-LABEL: fcmp_uno: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cun.d $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_uno: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cun.d $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"uno", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} diff --git a/llvm/test/CodeGen/LoongArch/float-fcmp-strict.ll b/llvm/test/CodeGen/LoongArch/float-fcmp-strict.ll new file mode 100644 index 00000000000000..0459d5019378f2 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/float-fcmp-strict.ll @@ -0,0 +1,243 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d < %s | FileCheck %s --check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+f,-d < %s | FileCheck %s --check-prefix=LA64 + +declare i1 @llvm.experimental.constrained.fcmp.f32(float, float, metadata, metadata) + +define i32 @fcmp_oeq(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_oeq: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.ceq.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_oeq: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.ceq.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"oeq", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ogt(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_ogt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.clt.s $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ogt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.clt.s $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"ogt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_oge(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_oge: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cle.s $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_oge: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cle.s $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"oge", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_olt(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_olt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.clt.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_olt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.clt.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"olt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ole(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_ole: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cle.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ole: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cle.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"ole", metadata 
!"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_one(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_one: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cne.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_one: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cne.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"one", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ord(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_ord: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cor.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ord: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cor.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"ord", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ueq(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_ueq: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cueq.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ueq: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cueq.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"ueq", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ugt(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_ugt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cult.s $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ugt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cult.s $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"ugt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_uge(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_uge: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cule.s $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_uge: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cule.s $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"uge", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ult(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_ult: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cult.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ult: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cult.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"ult", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ule(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_ule: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cule.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ule: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cule.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 
@llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"ule", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_une(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_une: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cune.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_une: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cune.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"une", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_uno(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_uno: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cun.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_uno: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cun.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"uno", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} diff --git a/llvm/test/CodeGen/LoongArch/float-fcmps-strict.ll b/llvm/test/CodeGen/LoongArch/float-fcmps-strict.ll new file mode 100644 index 00000000000000..cad4d45c147ee4 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/float-fcmps-strict.ll @@ -0,0 +1,482 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d < %s | FileCheck %s --check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+f,-d < %s | FileCheck %s --check-prefix=LA64 + +declare i1 @llvm.experimental.constrained.fcmps.f32(float, float, metadata, metadata) +declare i1 @llvm.experimental.constrained.fcmp.f32(float, float, metadata, metadata) + +define i32 @fcmps_oeq(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmps_oeq: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.seq.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_oeq: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.seq.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"oeq", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_ogt(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmps_ogt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.slt.s $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_ogt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.slt.s $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"ogt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_oge(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmps_oge: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sle.s $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_oge: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sle.s $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"oge", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_olt(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmps_olt: +; LA32: # %bb.0: +; LA32-NEXT: 
fcmp.slt.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_olt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.slt.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"olt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_ole(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmps_ole: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sle.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_ole: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sle.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"ole", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_one(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmps_one: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sne.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_one: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sne.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"one", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_ord(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmps_ord: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sor.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_ord: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sor.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"ord", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_ueq(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmps_ueq: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sueq.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_ueq: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sueq.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"ueq", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_ugt(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmps_ugt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sult.s $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_ugt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sult.s $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"ugt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_uge(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmps_uge: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sule.s $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_uge: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sule.s $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"uge", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_ult(float %a, float %b) 
nounwind strictfp { +; LA32-LABEL: fcmps_ult: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sult.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_ult: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sult.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"ult", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_ule(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmps_ule: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sule.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_ule: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sule.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"ule", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_une(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmps_une: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sune.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_une: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sune.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"une", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmps_uno(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmps_uno: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.sun.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmps_uno: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.sun.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"uno", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_oeq(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_oeq: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.ceq.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_oeq: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.ceq.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"oeq", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ogt(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_ogt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.clt.s $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ogt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.clt.s $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"ogt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_oge(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_oge: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cle.s $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_oge: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cle.s $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"oge", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to 
i32 + ret i32 %2 +} + +define i32 @fcmp_olt(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_olt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.clt.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_olt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.clt.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"olt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ole(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_ole: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cle.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ole: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cle.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"ole", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_one(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_one: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cne.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_one: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cne.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"one", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ord(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_ord: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cor.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ord: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cor.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"ord", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ueq(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_ueq: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cueq.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ueq: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cueq.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"ueq", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ugt(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_ugt: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cult.s $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ugt: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cult.s $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"ugt", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_uge(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_uge: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cule.s $fcc0, $fa1, $fa0 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_uge: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cule.s $fcc0, $fa1, $fa0 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"uge", metadata 
!"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ult(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_ult: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cult.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ult: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cult.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"ult", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_ule(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_ule: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cule.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_ule: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cule.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"ule", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_une(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_une: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cune.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_une: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cune.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"une", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} + +define i32 @fcmp_uno(float %a, float %b) nounwind strictfp { +; LA32-LABEL: fcmp_uno: +; LA32: # %bb.0: +; LA32-NEXT: fcmp.cun.s $fcc0, $fa0, $fa1 +; LA32-NEXT: movcf2gr $a0, $fcc0 +; LA32-NEXT: ret +; +; LA64-LABEL: fcmp_uno: +; LA64: # %bb.0: +; LA64-NEXT: fcmp.cun.s $fcc0, $fa0, $fa1 +; LA64-NEXT: movcf2gr $a0, $fcc0 +; LA64-NEXT: ret + %1 = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"uno", metadata !"fpexcept.strict") strictfp + %2 = zext i1 %1 to i32 + ret i32 %2 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll index c089034f932655..f25ef272e483b1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -1426,12 +1426,11 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: ldrd r12, r6, [r0, #4] ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: and r8, r3, #1 +; CHECK-NEXT: ldrd r12, r6, [r0, #4] +; CHECK-NEXT: ldrb.w r9, [r0] ; CHECK-NEXT: vldr.16 s0, .LCPI17_0 -; CHECK-NEXT: lsr.w r9, r3, #1 +; CHECK-NEXT: lsr.w r8, r3, #1 ; CHECK-NEXT: b .LBB17_3 ; CHECK-NEXT: .LBB17_1: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 @@ -1441,7 +1440,7 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc ; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 ; CHECK-NEXT: vstr.16 s5, [r12, #2] ; CHECK-NEXT: adds r6, #10 -; CHECK-NEXT: subs r0, #1 +; CHECK-NEXT: subs.w r9, r9, #1 ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: beq .LBB17_8 @@ -1458,7 +1457,7 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc ; CHECK-NEXT: vldrh.u16 q1, [r12] ; CHECK-NEXT: vmov.f32 s5, s1 ; CHECK-NEXT: mov 
r5, r2 -; CHECK-NEXT: wls lr, r9, .LBB17_6 +; CHECK-NEXT: wls lr, r8, .LBB17_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 ; CHECK-NEXT: mov r5, r2 @@ -1466,7 +1465,7 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc ; CHECK-NEXT: @ Parent Loop BB17_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldrh r7, [r1], #4 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vfma.f16 q1, q2, r7 ; CHECK-NEXT: ldrh r4, [r1, #-2] ; CHECK-NEXT: vmov.u16 r7, q1[0] @@ -1478,19 +1477,19 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc ; CHECK-NEXT: strh r4, [r5, #2] ; CHECK-NEXT: vmov.f32 s4, s5 ; CHECK-NEXT: strh r7, [r5], #4 -; CHECK-NEXT: vmov.16 q1[2], r3 +; CHECK-NEXT: vmov.16 q1[2], r0 ; CHECK-NEXT: le lr, .LBB17_5 ; CHECK-NEXT: .LBB17_6: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 -; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: lsls r0, r3, #31 ; CHECK-NEXT: beq .LBB17_1 ; CHECK-NEXT: @ %bb.7: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vfma.f16 q1, q2, r1 -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: vfma.f16 q1, q3, r1 -; CHECK-NEXT: strh r1, [r5] +; CHECK-NEXT: ldrh r0, [r1] +; CHECK-NEXT: vfma.f16 q1, q2, r0 +; CHECK-NEXT: vmov.u16 r0, q1[0] +; CHECK-NEXT: vfma.f16 q1, q3, r0 +; CHECK-NEXT: strh r0, [r5] ; CHECK-NEXT: vmovx.f16 s2, s4 ; CHECK-NEXT: vstr.16 s2, [r12] ; CHECK-NEXT: b .LBB17_2 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index ee91dcce9a7c8a..aff4bb32901f9f 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -2015,9 +2015,8 @@ define void @arm_biquad_cascade_df2T_f32(%struct.arm_biquad_cascade_df2T_instanc ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: ldrd r12, r6, [r0, #4] -; CHECK-NEXT: and r8, r3, #1 +; CHECK-NEXT: lsr.w r8, r3, #1 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: lsrs r3, r3, #1 ; CHECK-NEXT: vldr s0, .LCPI20_0 ; CHECK-NEXT: b .LBB20_3 ; CHECK-NEXT: .LBB20_1: @ %if.else @@ -2046,7 +2045,7 @@ define void @arm_biquad_cascade_df2T_f32(%struct.arm_biquad_cascade_df2T_instanc ; CHECK-NEXT: vmov.f32 s6, s0 ; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: vmov.f32 s7, s0 -; CHECK-NEXT: wls lr, r3, .LBB20_6 +; CHECK-NEXT: wls lr, r8, .LBB20_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 ; CHECK-NEXT: vmov q6, q1 @@ -2073,7 +2072,7 @@ define void @arm_biquad_cascade_df2T_f32(%struct.arm_biquad_cascade_df2T_instanc ; CHECK-NEXT: le lr, .LBB20_5 ; CHECK-NEXT: .LBB20_6: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 -; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: lsls r7, r3, #31 ; CHECK-NEXT: beq .LBB20_1 ; CHECK-NEXT: @ %bb.7: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 diff --git a/llvm/test/CodeGen/VE/Scalar/atomic.ll b/llvm/test/CodeGen/VE/Scalar/atomic.ll index f01bf0ff66c3f9..405cdc3f369376 100644 --- a/llvm/test/CodeGen/VE/Scalar/atomic.ll +++ b/llvm/test/CodeGen/VE/Scalar/atomic.ll @@ -184,13 +184,12 @@ define signext i32 @test_atomic_fetch_max_4() { ; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: lea.sl %s1, i@hi(, %s0) ; CHECK-NEXT: ldl.sx %s0, (, %s1) -; CHECK-NEXT: or %s2, 1, (0)1 ; CHECK-NEXT: .LBB6_1: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; 
CHECK-NEXT: or %s3, 0, %s0 -; CHECK-NEXT: maxs.w.sx %s0, %s0, %s2 -; CHECK-NEXT: cas.w %s0, (%s1), %s3 -; CHECK-NEXT: brne.w %s0, %s3, .LBB6_1 +; CHECK-NEXT: or %s2, 0, %s0 +; CHECK-NEXT: maxs.w.sx %s0, 1, %s0 +; CHECK-NEXT: cas.w %s0, (%s1), %s2 +; CHECK-NEXT: brne.w %s0, %s2, .LBB6_1 ; CHECK-NEXT: # %bb.2: # %atomicrmw.end ; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: fencem 3 @@ -207,19 +206,16 @@ define signext i32 @test_atomic_fetch_min_4() { ; CHECK-NEXT: fencem 3 ; CHECK-NEXT: lea %s0, i@lo ; CHECK-NEXT: and %s0, %s0, (32)0 -; CHECK-NEXT: lea.sl %s0, i@hi(, %s0) -; CHECK-NEXT: ldl.sx %s1, (, %s0) -; CHECK-NEXT: or %s2, 2, (0)1 +; CHECK-NEXT: lea.sl %s1, i@hi(, %s0) +; CHECK-NEXT: ldl.sx %s0, (, %s1) ; CHECK-NEXT: .LBB7_1: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or %s3, 0, %s1 -; CHECK-NEXT: cmps.w.sx %s4, %s1, %s2 -; CHECK-NEXT: or %s1, 1, (0)1 -; CHECK-NEXT: cmov.w.lt %s1, %s3, %s4 -; CHECK-NEXT: cas.w %s1, (%s0), %s3 -; CHECK-NEXT: brne.w %s1, %s3, .LBB7_1 +; CHECK-NEXT: or %s2, 0, %s0 +; CHECK-NEXT: mins.w.sx %s0, 1, %s0 +; CHECK-NEXT: cas.w %s0, (%s1), %s2 +; CHECK-NEXT: brne.w %s0, %s2, .LBB7_1 ; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: adds.w.sx %s0, %s1, (0)1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 ; CHECK-NEXT: fencem 3 ; CHECK-NEXT: b.l.t (, %s10) entry: diff --git a/llvm/test/CodeGen/VE/Scalar/max.ll b/llvm/test/CodeGen/VE/Scalar/max.ll index 5b2834ef087313..12aa101cb48c4d 100644 --- a/llvm/test/CodeGen/VE/Scalar/max.ll +++ b/llvm/test/CodeGen/VE/Scalar/max.ll @@ -1,11 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=ve-unknown-unknown -enable-no-signed-zeros-fp-math \ +; RUN: -enable-no-nans-fp-math | FileCheck %s -check-prefix=OPT define double @maxf64(double, double) { ; CHECK-LABEL: maxf64: ; CHECK: # %bb.0: -; CHECK-NEXT: fmax.d %s0, %s0, %s1 +; CHECK-NEXT: fcmp.d %s2, %s0, %s1 +; CHECK-NEXT: cmov.d.gt %s1, %s0, %s2 +; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: maxf64: +; OPT: # %bb.0: +; OPT-NEXT: fmax.d %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ogt double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 @@ -14,8 +23,15 @@ define double @maxf64(double, double) { define double @max2f64(double, double) { ; CHECK-LABEL: max2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: fmax.d %s0, %s0, %s1 +; CHECK-NEXT: fcmp.d %s2, %s0, %s1 +; CHECK-NEXT: cmov.d.ge %s1, %s0, %s2 +; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: max2f64: +; OPT: # %bb.0: +; OPT-NEXT: fmax.d %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = fcmp oge double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 @@ -29,6 +45,11 @@ define double @maxuf64(double, double) { ; CHECK-NEXT: cmov.d.gtnan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: maxuf64: +; OPT: # %bb.0: +; OPT-NEXT: fmax.d %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ugt double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 @@ -42,6 +63,11 @@ define double @max2uf64(double, double) { ; CHECK-NEXT: cmov.d.genan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: max2uf64: +; OPT: # %bb.0: +; OPT-NEXT: fmax.d %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = fcmp uge double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 @@ -50,8 +76,15 @@ define double @max2uf64(double, double) { 
define float @maxf32(float, float) { ; CHECK-LABEL: maxf32: ; CHECK: # %bb.0: -; CHECK-NEXT: fmax.s %s0, %s0, %s1 +; CHECK-NEXT: fcmp.s %s2, %s0, %s1 +; CHECK-NEXT: cmov.s.gt %s1, %s0, %s2 +; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: maxf32: +; OPT: # %bb.0: +; OPT-NEXT: fmax.s %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ogt float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 @@ -60,8 +93,15 @@ define float @maxf32(float, float) { define float @max2f32(float, float) { ; CHECK-LABEL: max2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: fmax.s %s0, %s0, %s1 +; CHECK-NEXT: fcmp.s %s2, %s0, %s1 +; CHECK-NEXT: cmov.s.ge %s1, %s0, %s2 +; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: max2f32: +; OPT: # %bb.0: +; OPT-NEXT: fmax.s %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = fcmp oge float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 @@ -74,6 +114,11 @@ define float @maxuf32(float, float) { ; CHECK-NEXT: cmov.s.gtnan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: maxuf32: +; OPT: # %bb.0: +; OPT-NEXT: fmax.s %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ugt float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 @@ -86,6 +131,11 @@ define float @max2uf32(float, float) { ; CHECK-NEXT: cmov.s.genan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: max2uf32: +; OPT: # %bb.0: +; OPT-NEXT: fmax.s %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = fcmp uge float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 @@ -96,6 +146,11 @@ define i64 @maxi64(i64, i64) { ; CHECK: # %bb.0: ; CHECK-NEXT: maxs.l %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: maxi64: +; OPT: # %bb.0: +; OPT-NEXT: maxs.l %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = icmp sgt i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -106,6 +161,11 @@ define i64 @max2i64(i64, i64) { ; CHECK: # %bb.0: ; CHECK-NEXT: maxs.l %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: max2i64: +; OPT: # %bb.0: +; OPT-NEXT: maxs.l %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = icmp sge i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -118,6 +178,13 @@ define i64 @maxu64(i64, i64) { ; CHECK-NEXT: cmov.l.gt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: maxu64: +; OPT: # %bb.0: +; OPT-NEXT: cmpu.l %s2, %s0, %s1 +; OPT-NEXT: cmov.l.gt %s1, %s0, %s2 +; OPT-NEXT: or %s0, 0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = icmp ugt i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -130,6 +197,13 @@ define i64 @max2u64(i64, i64) { ; CHECK-NEXT: cmov.l.ge %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: max2u64: +; OPT: # %bb.0: +; OPT-NEXT: cmpu.l %s2, %s0, %s1 +; OPT-NEXT: cmov.l.ge %s1, %s0, %s2 +; OPT-NEXT: or %s0, 0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = icmp uge i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -140,6 +214,11 @@ define i32 @maxi32(i32, i32) { ; CHECK: # %bb.0: ; CHECK-NEXT: maxs.w.sx %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: maxi32: +; OPT: # %bb.0: +; OPT-NEXT: maxs.w.sx %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = icmp sgt i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -150,6 +229,11 @@ define i32 @max2i32(i32, i32) { ; CHECK: # %bb.0: ; CHECK-NEXT: maxs.w.sx %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: max2i32: +; OPT: # %bb.0: +; OPT-NEXT: maxs.w.sx %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) 
%3 = icmp sge i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -162,6 +246,13 @@ define i32 @maxu32(i32, i32) { ; CHECK-NEXT: cmov.w.gt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: maxu32: +; OPT: # %bb.0: +; OPT-NEXT: cmpu.w %s2, %s0, %s1 +; OPT-NEXT: cmov.w.gt %s1, %s0, %s2 +; OPT-NEXT: or %s0, 0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = icmp ugt i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -174,6 +265,13 @@ define i32 @max2u32(i32, i32) { ; CHECK-NEXT: cmov.w.ge %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: max2u32: +; OPT: # %bb.0: +; OPT-NEXT: cmpu.w %s2, %s0, %s1 +; OPT-NEXT: cmov.w.ge %s1, %s0, %s2 +; OPT-NEXT: or %s0, 0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = icmp uge i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -184,6 +282,11 @@ define zeroext i1 @maxi1(i1 zeroext, i1 zeroext) { ; CHECK: # %bb.0: ; CHECK-NEXT: or %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: maxi1: +; OPT: # %bb.0: +; OPT-NEXT: or %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = xor i1 %1, true %4 = and i1 %3, %0 %5 = select i1 %4, i1 %0, i1 %1 diff --git a/llvm/test/CodeGen/VE/Scalar/min.ll b/llvm/test/CodeGen/VE/Scalar/min.ll index 866a5d6c2b914a..da92ebafd05903 100644 --- a/llvm/test/CodeGen/VE/Scalar/min.ll +++ b/llvm/test/CodeGen/VE/Scalar/min.ll @@ -1,10 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=ve-unknown-unknown -enable-no-signed-zeros-fp-math \ +; RUN: -enable-no-nans-fp-math | FileCheck %s -check-prefix=OPT define double @minf64(double, double) { ; CHECK-LABEL: minf64: ; CHECK: # %bb.0: -; CHECK-NEXT: fmin.d %s0, %s0, %s1 +; CHECK-NEXT: fcmp.d %s2, %s0, %s1 +; CHECK-NEXT: cmov.d.lt %s1, %s0, %s2 +; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: minf64: +; OPT: # %bb.0: +; OPT-NEXT: fmin.d %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = fcmp olt double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 @@ -13,8 +23,15 @@ define double @minf64(double, double) { define double @min2f64(double, double) { ; CHECK-LABEL: min2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: fmin.d %s0, %s0, %s1 +; CHECK-NEXT: fcmp.d %s2, %s0, %s1 +; CHECK-NEXT: cmov.d.le %s1, %s0, %s2 +; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: min2f64: +; OPT: # %bb.0: +; OPT-NEXT: fmin.d %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ole double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 @@ -27,6 +44,11 @@ define double @minuf64(double, double) { ; CHECK-NEXT: cmov.d.ltnan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: minuf64: +; OPT: # %bb.0: +; OPT-NEXT: fmin.d %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ult double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 @@ -39,6 +61,11 @@ define double @min2uf64(double, double) { ; CHECK-NEXT: cmov.d.lenan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: min2uf64: +; OPT: # %bb.0: +; OPT-NEXT: fmin.d %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ule double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 @@ -47,8 +74,15 @@ define double @min2uf64(double, double) { define float @minf32(float, float) { ; CHECK-LABEL: minf32: ; CHECK: # %bb.0: -; CHECK-NEXT: fmin.s %s0, %s0, %s1 +; CHECK-NEXT: fcmp.s %s2, %s0, %s1 +; CHECK-NEXT: cmov.s.lt %s1, %s0, 
%s2 +; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: minf32: +; OPT: # %bb.0: +; OPT-NEXT: fmin.s %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = fcmp olt float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 @@ -57,8 +91,15 @@ define float @minf32(float, float) { define float @min2f32(float, float) { ; CHECK-LABEL: min2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: fmin.s %s0, %s0, %s1 +; CHECK-NEXT: fcmp.s %s2, %s0, %s1 +; CHECK-NEXT: cmov.s.le %s1, %s0, %s2 +; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: min2f32: +; OPT: # %bb.0: +; OPT-NEXT: fmin.s %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ole float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 @@ -71,6 +112,11 @@ define float @minuf32(float, float) { ; CHECK-NEXT: cmov.s.ltnan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: minuf32: +; OPT: # %bb.0: +; OPT-NEXT: fmin.s %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ult float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 @@ -83,6 +129,11 @@ define float @min2uf32(float, float) { ; CHECK-NEXT: cmov.s.lenan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: min2uf32: +; OPT: # %bb.0: +; OPT-NEXT: fmin.s %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ule float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 @@ -93,6 +144,11 @@ define i64 @mini64(i64, i64) { ; CHECK: # %bb.0: ; CHECK-NEXT: mins.l %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: mini64: +; OPT: # %bb.0: +; OPT-NEXT: mins.l %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = icmp slt i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -103,6 +159,11 @@ define i64 @min2i64(i64, i64) { ; CHECK: # %bb.0: ; CHECK-NEXT: mins.l %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: min2i64: +; OPT: # %bb.0: +; OPT-NEXT: mins.l %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = icmp sle i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -115,6 +176,13 @@ define i64 @minu64(i64, i64) { ; CHECK-NEXT: cmov.l.lt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: minu64: +; OPT: # %bb.0: +; OPT-NEXT: cmpu.l %s2, %s0, %s1 +; OPT-NEXT: cmov.l.lt %s1, %s0, %s2 +; OPT-NEXT: or %s0, 0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = icmp ult i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -127,6 +195,13 @@ define i64 @min2u64(i64, i64) { ; CHECK-NEXT: cmov.l.le %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: min2u64: +; OPT: # %bb.0: +; OPT-NEXT: cmpu.l %s2, %s0, %s1 +; OPT-NEXT: cmov.l.le %s1, %s0, %s2 +; OPT-NEXT: or %s0, 0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = icmp ule i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -137,6 +212,11 @@ define i32 @mini32(i32, i32) { ; CHECK: # %bb.0: ; CHECK-NEXT: mins.w.sx %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: mini32: +; OPT: # %bb.0: +; OPT-NEXT: mins.w.sx %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = icmp slt i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -147,6 +227,11 @@ define i32 @min2i32(i32, i32) { ; CHECK: # %bb.0: ; CHECK-NEXT: mins.w.sx %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: min2i32: +; OPT: # %bb.0: +; OPT-NEXT: mins.w.sx %s0, %s0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = icmp sle i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -159,6 +244,13 @@ define i32 @minu32(i32, i32) { ; CHECK-NEXT: cmov.w.lt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, 
%s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: minu32: +; OPT: # %bb.0: +; OPT-NEXT: cmpu.w %s2, %s0, %s1 +; OPT-NEXT: cmov.w.lt %s1, %s0, %s2 +; OPT-NEXT: or %s0, 0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = icmp ult i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -171,6 +263,13 @@ define i32 @min2u32(i32, i32) { ; CHECK-NEXT: cmov.w.le %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: min2u32: +; OPT: # %bb.0: +; OPT-NEXT: cmpu.w %s2, %s0, %s1 +; OPT-NEXT: cmov.w.le %s1, %s0, %s2 +; OPT-NEXT: or %s0, 0, %s1 +; OPT-NEXT: b.l.t (, %s10) %3 = icmp ule i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -183,6 +282,13 @@ define zeroext i1 @mini1(i1 zeroext, i1 zeroext) { ; CHECK-NEXT: cmov.w.ne %s2, %s1, %s0 ; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) +; +; OPT-LABEL: mini1: +; OPT: # %bb.0: +; OPT-NEXT: and %s2, %s1, %s0 +; OPT-NEXT: cmov.w.ne %s2, %s1, %s0 +; OPT-NEXT: adds.w.zx %s0, %s2, (0)1 +; OPT-NEXT: b.l.t (, %s10) %3 = xor i1 %0, true %4 = and i1 %3, %1 %5 = select i1 %4, i1 %0, i1 %1 diff --git a/llvm/test/CodeGen/VE/Scalar/smax.ll b/llvm/test/CodeGen/VE/Scalar/smax.ll new file mode 100644 index 00000000000000..f989e0434b59d7 --- /dev/null +++ b/llvm/test/CodeGen/VE/Scalar/smax.ll @@ -0,0 +1,337 @@ +; RUN: llc < %s -mtriple=ve | FileCheck %s + +;;; Test ‘llvm.smax.*’ intrinsic +;;; +;;; Syntax: +;;; This is an overloaded intrinsic. You can use @llvm.smax on any +;;; integer bit width or any vector of integer elements. +;;; +;;; declare i32 @llvm.smax.i32(i32 %a, i32 %b) +;;; declare <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b) +;;; +;;; Overview: +;;; Return the larger of %a and %b comparing the values as signed +;;; integers. Vector intrinsics operate on a per-element basis. +;;; The larger element of %a and %b at a given index is returned +;;; for that index. +;;; +;;; Arguments: +;;; The arguments (%a and %b) may be of any integer type or a vector +;;; with integer element type. The argument types must match each +;;; other, and the return type must match the argument type. +;;; +;;; Note: +;;; We test only i8/i16/i32/i64/i128. 
+ +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i8 @func_smax_var_i8(i8 noundef signext %0, i8 noundef signext %1) { +; CHECK-LABEL: func_smax_var_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i8 @llvm.smax.i8(i8 %0, i8 %1) + ret i8 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i16 @func_smax_var_i16(i16 noundef signext %0, i16 noundef signext %1) { +; CHECK-LABEL: func_smax_var_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i16 @llvm.smax.i16(i16 %0, i16 %1) + ret i16 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i32 @func_smax_var_i32(i32 noundef signext %0, i32 noundef signext %1) { +; CHECK-LABEL: func_smax_var_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i32 @llvm.smax.i32(i32 %0, i32 %1) + ret i32 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i64 @func_smax_var_i64(i64 noundef %0, i64 noundef %1) { +; CHECK-LABEL: func_smax_var_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.l %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i64 @llvm.smax.i64(i64 %0, i64 %1) + ret i64 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i128 @func_smax_var_i128(i128 noundef %0, i128 noundef %1) { +; CHECK-LABEL: func_smax_var_i128: +; CHECK: # %bb.0: +; CHECK-NEXT: cmps.l %s5, %s1, %s3 +; CHECK-NEXT: or %s4, 0, %s2 +; CHECK-NEXT: cmov.l.gt %s4, %s0, %s5 +; CHECK-NEXT: cmpu.l %s6, %s0, %s2 +; CHECK-NEXT: cmov.l.gt %s2, %s0, %s6 +; CHECK-NEXT: cmov.l.eq %s4, %s2, %s5 +; CHECK-NEXT: maxs.l %s1, %s1, %s3 +; CHECK-NEXT: or %s0, 0, %s4 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i128 @llvm.smax.i128(i128 %0, i128 %1) + ret i128 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i8 @func_smax_fore_zero_i8(i8 noundef signext %0) { +; CHECK-LABEL: func_smax_fore_zero_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, 0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i8 @llvm.smax.i8(i8 %0, i8 0) + ret i8 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i16 @func_smax_fore_zero_i16(i16 noundef signext %0) { +; CHECK-LABEL: func_smax_fore_zero_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, 0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i16 @llvm.smax.i16(i16 %0, i16 0) + ret i16 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i32 @func_smax_fore_zero_i32(i32 noundef signext %0) { +; CHECK-LABEL: func_smax_fore_zero_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, 0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i32 @llvm.smax.i32(i32 %0, i32 0) + ret i32 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i64 @func_smax_fore_zero_i64(i64 noundef %0) { +; CHECK-LABEL: func_smax_fore_zero_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.l %s0, 0, %s0 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i64 @llvm.smax.i64(i64 %0, i64 0) + ret i64 
%2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i128 @func_smax_fore_zero_i128(i128 noundef %0) { +; CHECK-LABEL: func_smax_fore_zero_i128: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmps.l %s3, %s1, (0)1 +; CHECK-NEXT: cmov.l.gt %s2, %s0, %s3 +; CHECK-NEXT: cmov.l.eq %s2, %s0, %s3 +; CHECK-NEXT: maxs.l %s1, 0, %s1 +; CHECK-NEXT: or %s0, 0, %s2 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i128 @llvm.smax.i128(i128 %0, i128 0) + ret i128 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i8 @func_smax_back_zero_i8(i8 noundef signext %0) { +; CHECK-LABEL: func_smax_back_zero_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, 0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i8 @llvm.smax.i8(i8 %0, i8 0) + ret i8 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i16 @func_smax_back_zero_i16(i16 noundef signext %0) { +; CHECK-LABEL: func_smax_back_zero_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, 0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i16 @llvm.smax.i16(i16 %0, i16 0) + ret i16 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i32 @func_smax_back_zero_i32(i32 noundef signext %0) { +; CHECK-LABEL: func_smax_back_zero_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, 0, %s0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i32 @llvm.smax.i32(i32 %0, i32 0) + ret i32 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i64 @func_smax_back_zero_i64(i64 noundef %0) { +; CHECK-LABEL: func_smax_back_zero_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.l %s0, 0, %s0 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i64 @llvm.smax.i64(i64 %0, i64 0) + ret i64 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i128 @func_smax_back_zero_i128(i128 noundef %0) { +; CHECK-LABEL: func_smax_back_zero_i128: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmps.l %s3, %s1, (0)1 +; CHECK-NEXT: cmov.l.gt %s2, %s0, %s3 +; CHECK-NEXT: cmov.l.eq %s2, %s0, %s3 +; CHECK-NEXT: maxs.l %s1, 0, %s1 +; CHECK-NEXT: or %s0, 0, %s2 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i128 @llvm.smax.i128(i128 %0, i128 0) + ret i128 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i8 @func_smax_fore_const_i8(i8 noundef signext %0) { +; CHECK-LABEL: func_smax_fore_const_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, -1, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i8 @llvm.smax.i8(i8 %0, i8 -1) + ret i8 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i16 @func_smax_fore_const_i16(i16 noundef signext %0) { +; CHECK-LABEL: func_smax_fore_const_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, %s0, (56)0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i16 @llvm.smax.i16(i16 %0, i16 255) + ret i16 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i32 @func_smax_fore_const_i32(i32 noundef signext %0) { +; CHECK-LABEL: func_smax_fore_const_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, %s0, (56)0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: 
b.l.t (, %s10) + %2 = tail call i32 @llvm.smax.i32(i32 %0, i32 255) + ret i32 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i64 @func_smax_fore_const_i64(i64 noundef %0) { +; CHECK-LABEL: func_smax_fore_const_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.l %s0, %s0, (56)0 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i64 @llvm.smax.i64(i64 %0, i64 255) + ret i64 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i128 @func_smax_fore_const_i128(i128 noundef %0) { +; CHECK-LABEL: func_smax_fore_const_i128: +; CHECK: # %bb.0: +; CHECK-NEXT: cmps.l %s3, %s1, (0)1 +; CHECK-NEXT: lea %s4, 255 +; CHECK-NEXT: lea %s2, 255 +; CHECK-NEXT: cmov.l.gt %s2, %s0, %s3 +; CHECK-NEXT: cmpu.l %s5, %s0, (56)0 +; CHECK-NEXT: cmov.l.gt %s4, %s0, %s5 +; CHECK-NEXT: cmov.l.eq %s2, %s4, %s3 +; CHECK-NEXT: maxs.l %s1, 0, %s1 +; CHECK-NEXT: or %s0, 0, %s2 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i128 @llvm.smax.i128(i128 %0, i128 255) + ret i128 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i8 @func_smax_back_const_i8(i8 noundef signext %0) { +; CHECK-LABEL: func_smax_back_const_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, -1, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i8 @llvm.smax.i8(i8 %0, i8 -1) + ret i8 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i16 @func_smax_back_const_i16(i16 noundef signext %0) { +; CHECK-LABEL: func_smax_back_const_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, %s0, (56)0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i16 @llvm.smax.i16(i16 %0, i16 255) + ret i16 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i32 @func_smax_back_const_i32(i32 noundef signext %0) { +; CHECK-LABEL: func_smax_back_const_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, %s0, (56)0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i32 @llvm.smax.i32(i32 %0, i32 255) + ret i32 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i64 @func_smax_back_const_i64(i64 noundef %0) { +; CHECK-LABEL: func_smax_back_const_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.l %s0, %s0, (56)0 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i64 @llvm.smax.i64(i64 %0, i64 255) + ret i64 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i128 @func_smax_back_const_i128(i128 noundef %0) { +; CHECK-LABEL: func_smax_back_const_i128: +; CHECK: # %bb.0: +; CHECK-NEXT: cmps.l %s3, %s1, (0)1 +; CHECK-NEXT: lea %s4, 255 +; CHECK-NEXT: lea %s2, 255 +; CHECK-NEXT: cmov.l.gt %s2, %s0, %s3 +; CHECK-NEXT: cmpu.l %s5, %s0, (56)0 +; CHECK-NEXT: cmov.l.gt %s4, %s0, %s5 +; CHECK-NEXT: cmov.l.eq %s2, %s4, %s3 +; CHECK-NEXT: maxs.l %s1, 0, %s1 +; CHECK-NEXT: or %s0, 0, %s2 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i128 @llvm.smax.i128(i128 %0, i128 255) + ret i128 %2 +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i32 @llvm.smax.i32(i32, i32) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i8 @llvm.smax.i8(i8, i8) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i16 @llvm.smax.i16(i16, i16) + +; Function Attrs: nocallback nofree nosync 
nounwind readnone speculatable willreturn +declare i64 @llvm.smax.i64(i64, i64) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i128 @llvm.smax.i128(i128, i128) diff --git a/llvm/test/CodeGen/VE/Scalar/smin.ll b/llvm/test/CodeGen/VE/Scalar/smin.ll new file mode 100644 index 00000000000000..a46c4e19a4ea82 --- /dev/null +++ b/llvm/test/CodeGen/VE/Scalar/smin.ll @@ -0,0 +1,337 @@ +; RUN: llc < %s -mtriple=ve | FileCheck %s + +;;; Test ‘llvm.smin.*’ intrinsic +;;; +;;; Syntax: +;;; This is an overloaded intrinsic. You can use @llvm.smin on any +;;; integer bit width or any vector of integer elements. +;;; +;;; declare i32 @llvm.smin.i32(i32 %a, i32 %b) +;;; declare <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> %b) +;;; +;;; Overview: +;;; Return the smaller of %a and %b comparing the values as signed +;;; integers. Vector intrinsics operate on a per-element basis. +;;; The smaller element of %a and %b at a given index is returned +;;; for that index. +;;; +;;; Arguments: +;;; The arguments (%a and %b) may be of any integer type or a vector +;;; with integer element type. The argument types must match each +;;; other, and the return type must match the argument type. +;;; +;;; Note: +;;; We test only i8/i16/i32/i64/i128. + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i8 @func_smin_var_i8(i8 noundef signext %0, i8 noundef signext %1) { +; CHECK-LABEL: func_smin_var_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i8 @llvm.smin.i8(i8 %0, i8 %1) + ret i8 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i16 @func_smin_var_i16(i16 noundef signext %0, i16 noundef signext %1) { +; CHECK-LABEL: func_smin_var_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i16 @llvm.smin.i16(i16 %0, i16 %1) + ret i16 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i32 @func_smin_var_i32(i32 noundef signext %0, i32 noundef signext %1) { +; CHECK-LABEL: func_smin_var_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i32 @llvm.smin.i32(i32 %0, i32 %1) + ret i32 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i64 @func_smin_var_i64(i64 noundef %0, i64 noundef %1) { +; CHECK-LABEL: func_smin_var_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.l %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i64 @llvm.smin.i64(i64 %0, i64 %1) + ret i64 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i128 @func_smin_var_i128(i128 noundef %0, i128 noundef %1) { +; CHECK-LABEL: func_smin_var_i128: +; CHECK: # %bb.0: +; CHECK-NEXT: cmps.l %s5, %s1, %s3 +; CHECK-NEXT: or %s4, 0, %s2 +; CHECK-NEXT: cmov.l.lt %s4, %s0, %s5 +; CHECK-NEXT: cmpu.l %s6, %s0, %s2 +; CHECK-NEXT: cmov.l.lt %s2, %s0, %s6 +; CHECK-NEXT: cmov.l.eq %s4, %s2, %s5 +; CHECK-NEXT: mins.l %s1, %s1, %s3 +; CHECK-NEXT: or %s0, 0, %s4 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i128 @llvm.smin.i128(i128 %0, i128 %1) + ret i128 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i8 @func_smin_fore_zero_i8(i8 noundef signext %0) { +; CHECK-LABEL: 
func_smin_fore_zero_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, 0, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i8 @llvm.smin.i8(i8 %0, i8 0) + ret i8 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i16 @func_smin_fore_zero_i16(i16 noundef signext %0) { +; CHECK-LABEL: func_smin_fore_zero_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, 0, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i16 @llvm.smin.i16(i16 %0, i16 0) + ret i16 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i32 @func_smin_fore_zero_i32(i32 noundef signext %0) { +; CHECK-LABEL: func_smin_fore_zero_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, 0, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i32 @llvm.smin.i32(i32 %0, i32 0) + ret i32 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i64 @func_smin_fore_zero_i64(i64 noundef %0) { +; CHECK-LABEL: func_smin_fore_zero_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.l %s0, 0, %s0 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i64 @llvm.smin.i64(i64 %0, i64 0) + ret i64 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i128 @func_smin_fore_zero_i128(i128 noundef %0) { +; CHECK-LABEL: func_smin_fore_zero_i128: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmps.l %s3, %s1, (0)1 +; CHECK-NEXT: sra.l %s4, %s1, 63 +; CHECK-NEXT: and %s0, %s4, %s0 +; CHECK-NEXT: cmov.l.eq %s0, %s2, %s3 +; CHECK-NEXT: mins.l %s1, 0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i128 @llvm.smin.i128(i128 %0, i128 0) + ret i128 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i8 @func_smin_back_zero_i8(i8 noundef signext %0) { +; CHECK-LABEL: func_smin_back_zero_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, 0, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i8 @llvm.smin.i8(i8 %0, i8 0) + ret i8 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i16 @func_smin_back_zero_i16(i16 noundef signext %0) { +; CHECK-LABEL: func_smin_back_zero_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, 0, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i16 @llvm.smin.i16(i16 %0, i16 0) + ret i16 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i32 @func_smin_back_zero_i32(i32 noundef signext %0) { +; CHECK-LABEL: func_smin_back_zero_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, 0, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i32 @llvm.smin.i32(i32 %0, i32 0) + ret i32 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i64 @func_smin_back_zero_i64(i64 noundef %0) { +; CHECK-LABEL: func_smin_back_zero_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.l %s0, 0, %s0 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i64 @llvm.smin.i64(i64 %0, i64 0) + ret i64 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i128 @func_smin_back_zero_i128(i128 noundef %0) { +; CHECK-LABEL: func_smin_back_zero_i128: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmps.l %s3, %s1, (0)1 +; CHECK-NEXT: sra.l %s4, 
%s1, 63 +; CHECK-NEXT: and %s0, %s4, %s0 +; CHECK-NEXT: cmov.l.eq %s0, %s2, %s3 +; CHECK-NEXT: mins.l %s1, 0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i128 @llvm.smin.i128(i128 %0, i128 0) + ret i128 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i8 @func_smin_fore_const_i8(i8 noundef signext %0) { +; CHECK-LABEL: func_smin_fore_const_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, -1, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i8 @llvm.smin.i8(i8 %0, i8 -1) + ret i8 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i16 @func_smin_fore_const_i16(i16 noundef signext %0) { +; CHECK-LABEL: func_smin_fore_const_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, %s0, (56)0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i16 @llvm.smin.i16(i16 %0, i16 255) + ret i16 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i32 @func_smin_fore_const_i32(i32 noundef signext %0) { +; CHECK-LABEL: func_smin_fore_const_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, %s0, (56)0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i32 @llvm.smin.i32(i32 %0, i32 255) + ret i32 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i64 @func_smin_fore_const_i64(i64 noundef %0) { +; CHECK-LABEL: func_smin_fore_const_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.l %s0, %s0, (56)0 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i64 @llvm.smin.i64(i64 %0, i64 255) + ret i64 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i128 @func_smin_fore_const_i128(i128 noundef %0) { +; CHECK-LABEL: func_smin_fore_const_i128: +; CHECK: # %bb.0: +; CHECK-NEXT: cmps.l %s3, %s1, (0)1 +; CHECK-NEXT: lea %s4, 255 +; CHECK-NEXT: lea %s2, 255 +; CHECK-NEXT: cmov.l.lt %s2, %s0, %s3 +; CHECK-NEXT: cmpu.l %s5, %s0, (56)0 +; CHECK-NEXT: cmov.l.lt %s4, %s0, %s5 +; CHECK-NEXT: cmov.l.eq %s2, %s4, %s3 +; CHECK-NEXT: mins.l %s1, 0, %s1 +; CHECK-NEXT: or %s0, 0, %s2 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i128 @llvm.smin.i128(i128 %0, i128 255) + ret i128 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i8 @func_smin_back_const_i8(i8 noundef signext %0) { +; CHECK-LABEL: func_smin_back_const_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, -1, %s0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i8 @llvm.smin.i8(i8 %0, i8 -1) + ret i8 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i16 @func_smin_back_const_i16(i16 noundef signext %0) { +; CHECK-LABEL: func_smin_back_const_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, %s0, (56)0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i16 @llvm.smin.i16(i16 %0, i16 255) + ret i16 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define signext i32 @func_smin_back_const_i32(i32 noundef signext %0) { +; CHECK-LABEL: func_smin_back_const_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, %s0, (56)0 +; CHECK-NEXT: adds.w.sx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i32 @llvm.smin.i32(i32 %0, i32 255) + ret i32 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn 
+define i64 @func_smin_back_const_i64(i64 noundef %0) { +; CHECK-LABEL: func_smin_back_const_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.l %s0, %s0, (56)0 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i64 @llvm.smin.i64(i64 %0, i64 255) + ret i64 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i128 @func_smin_back_const_i128(i128 noundef %0) { +; CHECK-LABEL: func_smin_back_const_i128: +; CHECK: # %bb.0: +; CHECK-NEXT: cmps.l %s3, %s1, (0)1 +; CHECK-NEXT: lea %s4, 255 +; CHECK-NEXT: lea %s2, 255 +; CHECK-NEXT: cmov.l.lt %s2, %s0, %s3 +; CHECK-NEXT: cmpu.l %s5, %s0, (56)0 +; CHECK-NEXT: cmov.l.lt %s4, %s0, %s5 +; CHECK-NEXT: cmov.l.eq %s2, %s4, %s3 +; CHECK-NEXT: mins.l %s1, 0, %s1 +; CHECK-NEXT: or %s0, 0, %s2 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i128 @llvm.smin.i128(i128 %0, i128 255) + ret i128 %2 +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i32 @llvm.smin.i32(i32, i32) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i8 @llvm.smin.i8(i8, i8) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i16 @llvm.smin.i16(i16, i16) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i64 @llvm.smin.i64(i64, i64) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i128 @llvm.smin.i128(i128, i128) diff --git a/llvm/test/CodeGen/VE/Scalar/umax.ll b/llvm/test/CodeGen/VE/Scalar/umax.ll new file mode 100644 index 00000000000000..3df721fc789a64 --- /dev/null +++ b/llvm/test/CodeGen/VE/Scalar/umax.ll @@ -0,0 +1,358 @@ +; RUN: llc < %s -mtriple=ve | FileCheck %s + +;;; Test ‘llvm.umax.*’ intrinsic +;;; +;;; Syntax: +;;; This is an overloaded intrinsic. You can use @llvm.umax on any +;;; integer bit width or any vector of integer elements. +;;; +;;; declare i32 @llvm.umax.i32(i32 %a, i32 %b) +;;; declare <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> %b) +;;; +;;; Overview: +;;; Return the larger of %a and %b comparing the values as unsigned +;;; integers. Vector intrinsics operate on a per-element basis. The +;;; larger element of %a and %b at a given index is returned for +;;; that index. +;;; +;;; Arguments: +;;; The arguments (%a and %b) may be of any integer type or a vector +;;; with integer element type. The argument types must match each +;;; other, and the return type must match the argument type. +;;; +;;; Note: +;;; We test only i1/u8/u16/u32/u64/u128. 
+ +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define zeroext i1 @func_umax_var_i1(i1 noundef zeroext %0, i1 noundef zeroext %1) { +; CHECK-LABEL: func_umax_var_i1: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i1 @llvm.umax.i1(i1 %0, i1 %1) + ret i1 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define zeroext i8 @func_umax_var_u8(i8 noundef zeroext %0, i8 noundef zeroext %1) { +; CHECK-LABEL: func_umax_var_u8: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i8 @llvm.umax.i8(i8 %0, i8 %1) + ret i8 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define zeroext i16 @func_umax_var_u16(i16 noundef zeroext %0, i16 noundef zeroext %1) { +; CHECK-LABEL: func_umax_var_u16: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i16 @llvm.umax.i16(i16 %0, i16 %1) + ret i16 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define zeroext i32 @func_umax_var_u32(i32 noundef zeroext %0, i32 noundef zeroext %1) { +; CHECK-LABEL: func_umax_var_u32: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpu.w %s2, %s0, %s1 +; CHECK-NEXT: cmov.w.gt %s1, %s0, %s2 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i32 @llvm.umax.i32(i32 %0, i32 %1) + ret i32 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i64 @func_umax_var_u64(i64 noundef %0, i64 noundef %1) { +; CHECK-LABEL: func_umax_var_u64: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpu.l %s2, %s0, %s1 +; CHECK-NEXT: cmov.l.gt %s1, %s0, %s2 +; CHECK-NEXT: or %s0, 0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i64 @llvm.umax.i64(i64 %0, i64 %1) + ret i64 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i128 @func_umax_var_u128(i128 noundef %0, i128 noundef %1) { +; CHECK-LABEL: func_umax_var_u128: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpu.l %s5, %s1, %s3 +; CHECK-NEXT: or %s4, 0, %s2 +; CHECK-NEXT: cmov.l.gt %s4, %s0, %s5 +; CHECK-NEXT: cmpu.l %s6, %s0, %s2 +; CHECK-NEXT: cmov.l.gt %s2, %s0, %s6 +; CHECK-NEXT: cmps.l %s0, %s1, %s3 +; CHECK-NEXT: cmov.l.eq %s4, %s2, %s0 +; CHECK-NEXT: cmov.l.gt %s3, %s1, %s5 +; CHECK-NEXT: or %s0, 0, %s4 +; CHECK-NEXT: or %s1, 0, %s3 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i128 @llvm.umax.i128(i128 %0, i128 %1) + ret i128 %3 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i1 @func_umax_fore_zero_i1(i1 noundef returned zeroext %0) { +; CHECK-LABEL: func_umax_fore_zero_i1: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret i1 %0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i8 @func_umax_fore_zero_u8(i8 noundef returned zeroext %0) { +; CHECK-LABEL: func_umax_fore_zero_u8: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret i8 %0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i16 @func_umax_fore_zero_u16(i16 noundef returned zeroext %0) { +; CHECK-LABEL: func_umax_fore_zero_u16: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret i16 %0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone 
willreturn +define zeroext i32 @func_umax_fore_zero_u32(i32 noundef returned zeroext %0) { +; CHECK-LABEL: func_umax_fore_zero_u32: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret i32 %0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define i64 @func_umax_fore_zero_u64(i64 noundef returned %0) { +; CHECK-LABEL: func_umax_fore_zero_u64: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret i64 %0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define i128 @func_umax_fore_zero_u128(i128 noundef returned %0) { +; CHECK-LABEL: func_umax_fore_zero_u128: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret i128 %0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i1 @func_umax_back_zero_i1(i1 noundef returned zeroext %0) { +; CHECK-LABEL: func_umax_back_zero_i1: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret i1 %0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i8 @func_umax_back_zero_u8(i8 noundef returned zeroext %0) { +; CHECK-LABEL: func_umax_back_zero_u8: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret i8 %0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i16 @func_umax_back_zero_u16(i16 noundef returned zeroext %0) { +; CHECK-LABEL: func_umax_back_zero_u16: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret i16 %0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i32 @func_umax_back_zero_u32(i32 noundef returned zeroext %0) { +; CHECK-LABEL: func_umax_back_zero_u32: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret i32 %0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define i64 @func_umax_back_zero_u64(i64 noundef returned %0) { +; CHECK-LABEL: func_umax_back_zero_u64: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret i64 %0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define i128 @func_umax_back_zero_u128(i128 noundef returned %0) { +; CHECK-LABEL: func_umax_back_zero_u128: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret i128 %0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i1 @func_umax_fore_const_i1(i1 noundef zeroext %0) { +; CHECK-LABEL: func_umax_fore_const_i1: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s0, 1, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + ret i1 true +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i8 @func_umax_fore_const_u8(i8 noundef zeroext %0) { +; CHECK-LABEL: func_umax_fore_const_u8: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 255 +; CHECK-NEXT: b.l.t (, %s10) + ret i8 -1 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define zeroext i16 @func_umax_fore_const_u16(i16 noundef zeroext %0) { +; CHECK-LABEL: func_umax_fore_const_u16: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, %s0, (56)0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i16 @llvm.umax.i16(i16 %0, i16 255) + ret i16 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define zeroext i32 @func_umax_fore_const_u32(i32 noundef zeroext %0) { +; CHECK-LABEL: func_umax_fore_const_u32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 255 +; CHECK-NEXT: cmpu.w %s2, 
%s0, %s1 +; CHECK-NEXT: cmov.w.gt %s1, %s0, %s2 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i32 @llvm.umax.i32(i32 %0, i32 255) + ret i32 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i64 @func_umax_fore_const_u64(i64 noundef %0) { +; CHECK-LABEL: func_umax_fore_const_u64: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 255 +; CHECK-NEXT: cmpu.l %s2, %s0, (56)0 +; CHECK-NEXT: cmov.l.gt %s1, %s0, %s2 +; CHECK-NEXT: or %s0, 0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i64 @llvm.umax.i64(i64 %0, i64 255) + ret i64 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i128 @func_umax_fore_const_u128(i128 noundef %0) { +; CHECK-LABEL: func_umax_fore_const_u128: +; CHECK: # %bb.0: +; CHECK-NEXT: cmps.l %s3, %s1, (0)1 +; CHECK-NEXT: lea %s4, 255 +; CHECK-NEXT: lea %s2, 255 +; CHECK-NEXT: cmov.l.ne %s2, %s0, %s3 +; CHECK-NEXT: cmpu.l %s5, %s0, (56)0 +; CHECK-NEXT: cmov.l.gt %s4, %s0, %s5 +; CHECK-NEXT: cmov.l.eq %s2, %s4, %s3 +; CHECK-NEXT: or %s0, 0, %s2 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i128 @llvm.umax.i128(i128 %0, i128 255) + ret i128 %2 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i1 @func_umax_back_const_i1(i1 noundef zeroext %0) { +; CHECK-LABEL: func_umax_back_const_i1: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s0, 1, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + ret i1 true +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i8 @func_umax_back_const_u8(i8 noundef zeroext %0) { +; CHECK-LABEL: func_umax_back_const_u8: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, 255 +; CHECK-NEXT: b.l.t (, %s10) + ret i8 -1 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define zeroext i16 @func_umax_back_const_u16(i16 noundef zeroext %0) { +; CHECK-LABEL: func_umax_back_const_u16: +; CHECK: # %bb.0: +; CHECK-NEXT: maxs.w.sx %s0, %s0, (56)0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i16 @llvm.umax.i16(i16 %0, i16 255) + ret i16 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define zeroext i32 @func_umax_back_const_u32(i32 noundef zeroext %0) { +; CHECK-LABEL: func_umax_back_const_u32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 255 +; CHECK-NEXT: cmpu.w %s2, %s0, %s1 +; CHECK-NEXT: cmov.w.gt %s1, %s0, %s2 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i32 @llvm.umax.i32(i32 %0, i32 255) + ret i32 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i64 @func_umax_back_const_u64(i64 noundef %0) { +; CHECK-LABEL: func_umax_back_const_u64: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 255 +; CHECK-NEXT: cmpu.l %s2, %s0, (56)0 +; CHECK-NEXT: cmov.l.gt %s1, %s0, %s2 +; CHECK-NEXT: or %s0, 0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i64 @llvm.umax.i64(i64 %0, i64 255) + ret i64 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i128 @func_umax_back_const_u128(i128 noundef %0) { +; CHECK-LABEL: func_umax_back_const_u128: +; CHECK: # %bb.0: +; CHECK-NEXT: cmps.l %s3, %s1, (0)1 +; CHECK-NEXT: lea %s4, 255 +; CHECK-NEXT: lea %s2, 255 +; CHECK-NEXT: cmov.l.ne %s2, %s0, %s3 +; CHECK-NEXT: cmpu.l %s5, %s0, (56)0 +; CHECK-NEXT: cmov.l.gt %s4, %s0, %s5 +; CHECK-NEXT: cmov.l.eq %s2, %s4, %s3 +; CHECK-NEXT: or %s0, 0, %s2 +; CHECK-NEXT: b.l.t (, %s10) 
+ %2 = tail call i128 @llvm.umax.i128(i128 %0, i128 255) + ret i128 %2 +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i32 @llvm.umax.i32(i32, i32) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i1 @llvm.umax.i1(i1, i1) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i8 @llvm.umax.i8(i8, i8) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i16 @llvm.umax.i16(i16, i16) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i64 @llvm.umax.i64(i64, i64) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i128 @llvm.umax.i128(i128, i128) diff --git a/llvm/test/CodeGen/VE/Scalar/umin.ll b/llvm/test/CodeGen/VE/Scalar/umin.ll new file mode 100644 index 00000000000000..937fa420c8a346 --- /dev/null +++ b/llvm/test/CodeGen/VE/Scalar/umin.ll @@ -0,0 +1,368 @@ +; RUN: llc < %s -mtriple=ve | FileCheck %s + +;;; Test ‘llvm.umin.*’ intrinsic +;;; +;;; Syntax: +;;; This is an overloaded intrinsic. You can use @llvm.umin on any +;;; integer bit width or any vector of integer elements. +;;; +;;; declare i32 @llvm.umin.i32(i32 %a, i32 %b) +;;; declare <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> %b) +;;; +;;; Overview: +;;; Return the smaller of %a and %b comparing the values as unsigned +;;; integers. Vector intrinsics operate on a per-element basis. The +;;; smaller element of %a and %b at a given index is returned for +;;; that index. +;;; +;;; Arguments: +;;; The arguments (%a and %b) may be of any integer type or a vector +;;; with integer element type. The argument types must match each +;;; other, and the return type must match the argument type. +;;; +;;; Note: +;;; We test only i1/u8/u16/u32/u64/u128. 
+ +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define zeroext i1 @func_umin_var_i1(i1 noundef zeroext %0, i1 noundef zeroext %1) { +; CHECK-LABEL: func_umin_var_i1: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i1 @llvm.umin.i1(i1 %0, i1 %1) + ret i1 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define zeroext i8 @func_umin_var_u8(i8 noundef zeroext %0, i8 noundef zeroext %1) { +; CHECK-LABEL: func_umin_var_u8: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i8 @llvm.umin.i8(i8 %0, i8 %1) + ret i8 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define zeroext i16 @func_umin_var_u16(i16 noundef zeroext %0, i16 noundef zeroext %1) { +; CHECK-LABEL: func_umin_var_u16: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, %s0, %s1 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i16 @llvm.umin.i16(i16 %0, i16 %1) + ret i16 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define zeroext i32 @func_umin_var_u32(i32 noundef zeroext %0, i32 noundef zeroext %1) { +; CHECK-LABEL: func_umin_var_u32: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpu.w %s2, %s0, %s1 +; CHECK-NEXT: cmov.w.lt %s1, %s0, %s2 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i32 @llvm.umin.i32(i32 %0, i32 %1) + ret i32 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i64 @func_umin_var_u64(i64 noundef %0, i64 noundef %1) { +; CHECK-LABEL: func_umin_var_u64: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpu.l %s2, %s0, %s1 +; CHECK-NEXT: cmov.l.lt %s1, %s0, %s2 +; CHECK-NEXT: or %s0, 0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i64 @llvm.umin.i64(i64 %0, i64 %1) + ret i64 %3 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i128 @func_umin_var_u128(i128 noundef %0, i128 noundef %1) { +; CHECK-LABEL: func_umin_var_u128: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpu.l %s5, %s1, %s3 +; CHECK-NEXT: or %s4, 0, %s2 +; CHECK-NEXT: cmov.l.lt %s4, %s0, %s5 +; CHECK-NEXT: cmpu.l %s6, %s0, %s2 +; CHECK-NEXT: cmov.l.lt %s2, %s0, %s6 +; CHECK-NEXT: cmps.l %s0, %s1, %s3 +; CHECK-NEXT: cmov.l.eq %s4, %s2, %s0 +; CHECK-NEXT: cmov.l.lt %s3, %s1, %s5 +; CHECK-NEXT: or %s0, 0, %s4 +; CHECK-NEXT: or %s1, 0, %s3 +; CHECK-NEXT: b.l.t (, %s10) + %3 = tail call i128 @llvm.umin.i128(i128 %0, i128 %1) + ret i128 %3 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i1 @func_umin_fore_zero_i1(i1 noundef zeroext %0) { +; CHECK-LABEL: func_umin_fore_zero_i1: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + ret i1 false +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i8 @func_umin_fore_zero_u8(i8 noundef zeroext %0) { +; CHECK-LABEL: func_umin_fore_zero_u8: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + ret i8 0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i16 @func_umin_fore_zero_u16(i16 noundef zeroext %0) { +; CHECK-LABEL: func_umin_fore_zero_u16: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + ret i16 0 +} + +; Function 
Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i32 @func_umin_fore_zero_u32(i32 noundef zeroext %0) { +; CHECK-LABEL: func_umin_fore_zero_u32: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + ret i32 0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define i64 @func_umin_fore_zero_u64(i64 noundef %0) { +; CHECK-LABEL: func_umin_fore_zero_u64: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + ret i64 0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define i128 @func_umin_fore_zero_u128(i128 noundef %0) { +; CHECK-LABEL: func_umin_fore_zero_u128: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + ret i128 0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i1 @func_umin_back_zero_i1(i1 noundef zeroext %0) { +; CHECK-LABEL: func_umin_back_zero_i1: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + ret i1 false +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i8 @func_umin_back_zero_u8(i8 noundef zeroext %0) { +; CHECK-LABEL: func_umin_back_zero_u8: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + ret i8 0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i16 @func_umin_back_zero_u16(i16 noundef zeroext %0) { +; CHECK-LABEL: func_umin_back_zero_u16: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + ret i16 0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i32 @func_umin_back_zero_u32(i32 noundef zeroext %0) { +; CHECK-LABEL: func_umin_back_zero_u32: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + ret i32 0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define i64 @func_umin_back_zero_u64(i64 noundef %0) { +; CHECK-LABEL: func_umin_back_zero_u64: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + ret i64 0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define i128 @func_umin_back_zero_u128(i128 noundef %0) { +; CHECK-LABEL: func_umin_back_zero_u128: +; CHECK: # %bb.0: +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + ret i128 0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i1 @func_umin_fore_const_i1(i1 noundef returned zeroext %0) { +; CHECK-LABEL: func_umin_fore_const_i1: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret i1 %0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i8 @func_umin_fore_const_u8(i8 noundef returned zeroext %0) { +; CHECK-LABEL: func_umin_fore_const_u8: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret i8 %0 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define zeroext i16 @func_umin_fore_const_u16(i16 noundef zeroext %0) { +; CHECK-LABEL: func_umin_fore_const_u16: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, %s0, (56)0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i16 @llvm.umin.i16(i16 %0, i16 255) + ret 
i16 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define zeroext i32 @func_umin_fore_const_u32(i32 noundef zeroext %0) { +; CHECK-LABEL: func_umin_fore_const_u32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 255 +; CHECK-NEXT: cmpu.w %s2, %s0, %s1 +; CHECK-NEXT: cmov.w.lt %s1, %s0, %s2 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i32 @llvm.umin.i32(i32 %0, i32 255) + ret i32 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i64 @func_umin_fore_const_u64(i64 noundef %0) { +; CHECK-LABEL: func_umin_fore_const_u64: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 255 +; CHECK-NEXT: cmpu.l %s2, %s0, (56)0 +; CHECK-NEXT: cmov.l.lt %s1, %s0, %s2 +; CHECK-NEXT: or %s0, 0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i64 @llvm.umin.i64(i64 %0, i64 255) + ret i64 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i128 @func_umin_fore_const_u128(i128 noundef %0) { +; CHECK-LABEL: func_umin_fore_const_u128: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s2, 255 +; CHECK-NEXT: cmpu.l %s3, %s0, (56)0 +; CHECK-NEXT: lea %s4, 255 +; CHECK-NEXT: cmov.l.lt %s4, %s0, %s3 +; CHECK-NEXT: cmps.l %s0, %s1, (0)1 +; CHECK-NEXT: cmov.l.eq %s2, %s4, %s0 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: or %s0, 0, %s2 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i128 @llvm.umin.i128(i128 %0, i128 255) + ret i128 %2 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i1 @func_umin_back_const_i1(i1 noundef returned zeroext %0) { +; CHECK-LABEL: func_umin_back_const_i1: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret i1 %0 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define zeroext i8 @func_umin_back_const_u8(i8 noundef returned zeroext %0) { +; CHECK-LABEL: func_umin_back_const_u8: +; CHECK: # %bb.0: +; CHECK-NEXT: b.l.t (, %s10) + ret i8 %0 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define zeroext i16 @func_umin_back_const_u16(i16 noundef zeroext %0) { +; CHECK-LABEL: func_umin_back_const_u16: +; CHECK: # %bb.0: +; CHECK-NEXT: mins.w.sx %s0, %s0, (56)0 +; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i16 @llvm.umin.i16(i16 %0, i16 255) + ret i16 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define zeroext i32 @func_umin_back_const_u32(i32 noundef zeroext %0) { +; CHECK-LABEL: func_umin_back_const_u32: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 255 +; CHECK-NEXT: cmpu.w %s2, %s0, %s1 +; CHECK-NEXT: cmov.w.lt %s1, %s0, %s2 +; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i32 @llvm.umin.i32(i32 %0, i32 255) + ret i32 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i64 @func_umin_back_const_u64(i64 noundef %0) { +; CHECK-LABEL: func_umin_back_const_u64: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s1, 255 +; CHECK-NEXT: cmpu.l %s2, %s0, (56)0 +; CHECK-NEXT: cmov.l.lt %s1, %s0, %s2 +; CHECK-NEXT: or %s0, 0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i64 @llvm.umin.i64(i64 %0, i64 255) + ret i64 %2 +} + +; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn +define i128 @func_umin_back_const_u128(i128 noundef %0) { +; CHECK-LABEL: func_umin_back_const_u128: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s2, 255 +; CHECK-NEXT: cmpu.l %s3, %s0, (56)0 +; CHECK-NEXT: lea 
%s4, 255 +; CHECK-NEXT: cmov.l.lt %s4, %s0, %s3 +; CHECK-NEXT: cmps.l %s0, %s1, (0)1 +; CHECK-NEXT: cmov.l.eq %s2, %s4, %s0 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: or %s0, 0, %s2 +; CHECK-NEXT: b.l.t (, %s10) + %2 = tail call i128 @llvm.umin.i128(i128 %0, i128 255) + ret i128 %2 +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i32 @llvm.umin.i32(i32, i32) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i1 @llvm.umin.i1(i1, i1) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i8 @llvm.umin.i8(i8, i8) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i16 @llvm.umin.i16(i16, i16) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i64 @llvm.umin.i64(i64, i64) + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare i128 @llvm.umin.i128(i128, i128) diff --git a/llvm/test/Transforms/LoopDistribute/laa-invalidation.ll b/llvm/test/Transforms/LoopDistribute/laa-invalidation.ll new file mode 100644 index 00000000000000..2bf507bfe5d892 --- /dev/null +++ b/llvm/test/Transforms/LoopDistribute/laa-invalidation.ll @@ -0,0 +1,34 @@ +; RUN: opt -passes='loop-load-elim,indvars,loop-distribute' -enable-loop-distribute %s + +; REQUIRES: asserts +; XFAIL: * + +define void @test_pr50940(ptr %A, ptr %B) { +entry: + %gep.A.1 = getelementptr inbounds i16, ptr %A, i64 1 + br label %outer.header + +outer.header: + %gep.A.2 = getelementptr inbounds i16, ptr %gep.A.1, i64 1 + br i1 false, label %outer.latch, label %inner.ph + +inner.ph: ; preds = %for.body5 + %lcssa.gep = phi ptr [ %gep.A.2, %outer.header ] + %gep.A.3 = getelementptr inbounds i16, ptr %A, i64 3 + br label %inner + +inner: + %iv = phi i16 [ 0, %inner.ph ], [ %iv.next, %inner ] + %l = load <2 x i16>, ptr %lcssa.gep, align 1 + store i16 0, ptr %gep.A.3, align 1 + store i16 1, ptr %B, align 1 + %iv.next = add nuw nsw i16 %iv, 1 + %c.1 = icmp ult i16 %iv, 38 + br i1 %c.1, label %inner, label %exit + +outer.latch: + br label %outer.header + +exit: + ret void +} diff --git a/llvm/test/tools/llvm-readobj/COFF/exports-forwarder.yaml b/llvm/test/tools/llvm-readobj/COFF/exports-forwarder.yaml new file mode 100644 index 00000000000000..61f8ab1e595d4d --- /dev/null +++ b/llvm/test/tools/llvm-readobj/COFF/exports-forwarder.yaml @@ -0,0 +1,52 @@ +# RUN: yaml2obj %s -o %t +# RUN: llvm-readobj --coff-exports %t | FileCheck %s + +# CHECK: Export { +# CHECK-NEXT: Ordinal: 1 +# CHECK-NEXT: Name: LoadLibrary +# CHECK-NEXT: ForwardedTo: kernel32.LoadLibrary +# CHECK-NEXT: } + +# Test file generated with: +# clang -O2 --target=x86_64-windows-msvc test.c -nostdlib -c -o test.obj +# lld-link -dll -out:test.dll -entry:entry -export:LoadLibrary=kernel32.LoadLibrary test.obj +# test.c: +# void entry(void) {} + +--- !COFF +OptionalHeader: + AddressOfEntryPoint: 4096 + ImageBase: 6442450944 + SectionAlignment: 4096 + FileAlignment: 512 + MajorOperatingSystemVersion: 6 + MinorOperatingSystemVersion: 0 + MajorImageVersion: 0 + MinorImageVersion: 0 + MajorSubsystemVersion: 6 + MinorSubsystemVersion: 0 + Subsystem: IMAGE_SUBSYSTEM_WINDOWS_GUI + DLLCharacteristics: [ IMAGE_DLL_CHARACTERISTICS_HIGH_ENTROPY_VA, IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE, IMAGE_DLL_CHARACTERISTICS_NX_COMPAT ] + SizeOfStackReserve: 1048576 + SizeOfStackCommit: 4096 + SizeOfHeapReserve: 1048576 + SizeOfHeapCommit: 4096 + 
ExportTable: + RelativeVirtualAddress: 8192 + Size: 110 +header: + Machine: IMAGE_FILE_MACHINE_AMD64 + Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_LARGE_ADDRESS_AWARE, IMAGE_FILE_DLL ] +sections: + - Name: .text + Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ] + VirtualAddress: 4096 + VirtualSize: 1 + SectionData: C3 + - Name: .rdata + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ] + VirtualAddress: 8192 + VirtualSize: 110 + SectionData: 0000000000000000000000002820000001000000010000000100000043200000472000004B2000006578706F72742D666F727761726465722E632E746D702E646C6C00592000004D20000000004C6F61644C696272617279006B65726E656C33322E4C6F61644C69627261727900 +symbols: [] +... diff --git a/llvm/tools/llvm-readobj/COFFDumper.cpp b/llvm/tools/llvm-readobj/COFFDumper.cpp index 56c5d9c0ae5387..4ae9d131535695 100644 --- a/llvm/tools/llvm-readobj/COFFDumper.cpp +++ b/llvm/tools/llvm-readobj/COFFDumper.cpp @@ -1789,18 +1789,29 @@ void COFFDumper::printCOFFExports() { DictScope Export(W, "Export"); StringRef Name; - uint32_t Ordinal, RVA; + uint32_t Ordinal; + bool IsForwarder; if (Error E = Exp.getSymbolName(Name)) reportError(std::move(E), Obj->getFileName()); if (Error E = Exp.getOrdinal(Ordinal)) reportError(std::move(E), Obj->getFileName()); - if (Error E = Exp.getExportRVA(RVA)) + if (Error E = Exp.isForwarder(IsForwarder)) reportError(std::move(E), Obj->getFileName()); W.printNumber("Ordinal", Ordinal); W.printString("Name", Name); - W.printHex("RVA", RVA); + StringRef ForwardTo; + if (IsForwarder) { + if (Error E = Exp.getForwardTo(ForwardTo)) + reportError(std::move(E), Obj->getFileName()); + W.printString("ForwardedTo", ForwardTo); + } else { + uint32_t RVA; + if (Error E = Exp.getExportRVA(RVA)) + reportError(std::move(E), Obj->getFileName()); + W.printHex("RVA", RVA); + } } } diff --git a/llvm/tools/llvm-readobj/llvm-readobj.h b/llvm/tools/llvm-readobj/llvm-readobj.h index 989cd0aba6c01c..5a9fe28d883e57 100644 --- a/llvm/tools/llvm-readobj/llvm-readobj.h +++ b/llvm/tools/llvm-readobj/llvm-readobj.h @@ -50,6 +50,6 @@ extern OutputStyleTy Output; { #enum, ns::enum } #define LLVM_READOBJ_ENUM_CLASS_ENT(enum_class, enum) \ - { #enum, std::underlying_type<enum_class>::type(enum_class::enum) } + { #enum, std::underlying_type_t<enum_class>(enum_class::enum) } #endif diff --git a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp index 9e54c7745e4dd4..63b161ed73e0b9 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp @@ -307,7 +307,7 @@ TEST(LinkGraphTest, MakeAbsolute) { << "Unexpected number of external symbols"; // Add an external symbol. - auto &S2 = G.addExternalSymbol("S2", 0, Linkage::Strong); + auto &S2 = G.addExternalSymbol("S2", 0, true); EXPECT_TRUE(S2.isExternal()) << "Symbol should be external"; EXPECT_EQ( @@ -356,7 +356,7 @@ TEST(LinkGraphTest, MakeDefined) { auto &B1 = G.createContentBlock(Sec, BlockContent, B1Addr, 8, 0); // Add an external symbol.
-  auto &S1 = G.addExternalSymbol("S1", 4, Linkage::Strong);
+  auto &S1 = G.addExternalSymbol("S1", 4, true);

   EXPECT_FALSE(S1.isDefined()) << "Symbol should not be defined";
   EXPECT_TRUE(S1.isExternal()) << "Symbol should be external";
diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.h b/llvm/utils/TableGen/CodeGenDAGPatterns.h
index dbdc72f0873a1d..83ac7173441e61 100644
--- a/llvm/utils/TableGen/CodeGenDAGPatterns.h
+++ b/llvm/utils/TableGen/CodeGenDAGPatterns.h
@@ -50,7 +50,7 @@ using TreePatternNodePtr = std::shared_ptr<TreePatternNode>;
 /// To reduce the allocations even further, make MachineValueTypeSet own
 /// the storage and use std::array as the bit container.
 struct MachineValueTypeSet {
-  static_assert(std::is_same<std::underlying_type<MVT::SimpleValueType>::type,
+  static_assert(std::is_same<std::underlying_type_t<MVT::SimpleValueType>,
                              uint8_t>::value,
                 "Change uint8_t here to the SimpleValueType's type");
   static unsigned constexpr Capacity = std::numeric_limits<uint8_t>::max()+1;
diff --git a/mlir/include/mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h b/mlir/include/mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h
index 906c1152bd2d03..34d01392e2606d 100644
--- a/mlir/include/mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h
+++ b/mlir/include/mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h
@@ -16,15 +16,13 @@
 class Pass;
 class LLVMTypeConverter;
 class RewritePatternSet;

-#define GEN_PASS_DECL_CONVERTMEMREFTOLLVM
+#define GEN_PASS_DECL_MEMREFTOLLVMCONVERSIONPASS
 #include "mlir/Conversion/Passes.h.inc"

 /// Collect a set of patterns to convert memory-related operations from the
 /// MemRef dialect to the LLVM dialect.
 void populateMemRefToLLVMConversionPatterns(LLVMTypeConverter &converter,
                                             RewritePatternSet &patterns);
-
-std::unique_ptr<Pass> createMemRefToLLVMPass();
 } // namespace mlir

 #endif // MLIR_CONVERSION_MEMREFTOLLVM_MEMREFTOLLVM_H
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 6163c6ae5d0cb9..5f48481647935f 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -541,10 +541,9 @@ def ConvertMathToFuncs : Pass<"convert-math-to-funcs", "ModuleOp"> {
 // MemRefToLLVM
 //===----------------------------------------------------------------------===//

-def ConvertMemRefToLLVM : Pass<"convert-memref-to-llvm", "ModuleOp"> {
+def MemRefToLLVMConversionPass : Pass<"convert-memref-to-llvm", "ModuleOp"> {
   let summary = "Convert operations from the MemRef dialect to the LLVM "
                 "dialect";
-  let constructor = "mlir::createMemRefToLLVMPass()";
   let dependentDialects = ["LLVM::LLVMDialect"];
   let options = [
     Option<"useAlignedAlloc", "use-aligned-alloc", "bool", /*default=*/"false",
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
index 422a101040e156..4fb212024b1c1b 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
@@ -130,7 +130,7 @@ class OneShotAnalysisState : public AnalysisState {

   OneShotAnalysisState(const OneShotAnalysisState &) = delete;

-  virtual ~OneShotAnalysisState() = default;
+  ~OneShotAnalysisState() override = default;

   /// Return a reference to the BufferizationAliasInfo.
   BufferizationAliasInfo &getAliasInfo() { return aliasInfo; }
diff --git a/mlir/include/mlir/Dialect/CMakeLists.txt b/mlir/include/mlir/Dialect/CMakeLists.txt
index 270cd54cca96f3..22505ba06e521f 100644
--- a/mlir/include/mlir/Dialect/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/CMakeLists.txt
@@ -1,10 +1,10 @@
-add_subdirectory(Affine)
 add_subdirectory(AMDGPU)
+add_subdirectory(AMX)
+add_subdirectory(Affine)
 add_subdirectory(Arithmetic)
-add_subdirectory(Async)
 add_subdirectory(ArmNeon)
 add_subdirectory(ArmSVE)
-add_subdirectory(AMX)
+add_subdirectory(Async)
 add_subdirectory(Bufferization)
 add_subdirectory(Complex)
 add_subdirectory(ControlFlow)
@@ -12,11 +12,11 @@ add_subdirectory(DLTI)
 add_subdirectory(EmitC)
 add_subdirectory(Func)
 add_subdirectory(GPU)
-add_subdirectory(Math)
-add_subdirectory(Linalg)
 add_subdirectory(LLVMIR)
-add_subdirectory(MemRef)
+add_subdirectory(Linalg)
 add_subdirectory(MLProgram)
+add_subdirectory(Math)
+add_subdirectory(MemRef)
 add_subdirectory(NVGPU)
 add_subdirectory(OpenACC)
 add_subdirectory(OpenMP)
@@ -24,11 +24,12 @@ add_subdirectory(PDL)
 add_subdirectory(PDLInterp)
 add_subdirectory(Quant)
 add_subdirectory(SCF)
+add_subdirectory(SPIRV)
 add_subdirectory(Shape)
 add_subdirectory(SparseTensor)
-add_subdirectory(SPIRV)
 add_subdirectory(Tensor)
 add_subdirectory(Tosa)
 add_subdirectory(Transform)
+add_subdirectory(Utils)
 add_subdirectory(Vector)
 add_subdirectory(X86Vector)
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt
index 35840bdce8d2d6..f5d48b2ebcefe5 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt
@@ -44,7 +44,7 @@ add_dependencies(mlir-headers LinalgOdsGen)

 add_mlir_dialect(LinalgOps linalg)

-set(LLVM_TARGET_DEFINITIONS LinalgOps.td)
+set(LLVM_TARGET_DEFINITIONS LinalgEnums.td)
 mlir_tablegen(LinalgOpsEnums.h.inc -gen-enum-decls)
 mlir_tablegen(LinalgOpsEnums.cpp.inc -gen-enum-defs)
 add_public_tablegen_target(MLIRLinalgOpsEnumsIncGen)
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td
index 87108a9ef8e3aa..f55f093e7b1fc9 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td
@@ -13,6 +13,7 @@
 #ifndef LINALG_BASE
 #define LINALG_BASE

+include "mlir/Dialect/Linalg/IR/LinalgEnums.td"
 include "mlir/IR/EnumAttr.td"
 include "mlir/IR/OpBase.td"

@@ -62,37 +63,6 @@ def Linalg_Dialect : Dialect {
 }

 // Define the function attribute enums matching the OpDSL functions.
-def UnaryFn : I32EnumAttr<"UnaryFn", "", [
-  I32EnumAttrCase<"exp", 0>,
-  I32EnumAttrCase<"log", 1>,
-  I32EnumAttrCase<"abs", 2>,
-  I32EnumAttrCase<"ceil", 3>,
-  I32EnumAttrCase<"floor", 4>,
-  I32EnumAttrCase<"negf", 5>
-]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::linalg";
-}
-def BinaryFn : I32EnumAttr<"BinaryFn", "", [
-  I32EnumAttrCase<"add", 0>,
-  I32EnumAttrCase<"sub", 1>,
-  I32EnumAttrCase<"mul", 2>,
-  I32EnumAttrCase<"max_signed", 3>,
-  I32EnumAttrCase<"min_signed", 4>,
-  I32EnumAttrCase<"max_unsigned", 5>,
-  I32EnumAttrCase<"min_unsigned", 6>
-]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::linalg";
-}
-def TypeFn : I32EnumAttr<"TypeFn", "", [
-  I32EnumAttrCase<"cast_signed", 0>,
-  I32EnumAttrCase<"cast_unsigned", 1>
-]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::linalg";
-}
-
 def UnaryFnAttr : EnumAttr<Linalg_Dialect, UnaryFn, "unary_fn"> {
   let assemblyFormat = "`<` $value `>`";
 }
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td
new file mode 100644
index 00000000000000..6d50cda9718625
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td
@@ -0,0 +1,50 @@
+//===- LinalgEnums.td - Linalg dialect enums ----------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the definition file for enums used in linear algebra operations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LINALG_ENUMS
+#define LINALG_ENUMS
+
+include "mlir/IR/EnumAttr.td"
+
+// Define the function attribute enums matching the OpDSL functions.
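+// Editorial sketch (not part of the original change): combined with the
+// EnumAttr wrappers that remain in LinalgBase.td, these enums surface in IR
+// as attributes such as #linalg.unary_fn<exp> or #linalg.binary_fn<mul>,
+// e.g. on a named op: linalg.elemwise_unary {fun = #linalg.unary_fn<exp>}.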
+def UnaryFn : I32EnumAttr<"UnaryFn", "", [
+  I32EnumAttrCase<"exp", 0>,
+  I32EnumAttrCase<"log", 1>,
+  I32EnumAttrCase<"abs", 2>,
+  I32EnumAttrCase<"ceil", 3>,
+  I32EnumAttrCase<"floor", 4>,
+  I32EnumAttrCase<"negf", 5>
+]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::linalg";
+}
+def BinaryFn : I32EnumAttr<"BinaryFn", "", [
+  I32EnumAttrCase<"add", 0>,
+  I32EnumAttrCase<"sub", 1>,
+  I32EnumAttrCase<"mul", 2>,
+  I32EnumAttrCase<"max_signed", 3>,
+  I32EnumAttrCase<"min_signed", 4>,
+  I32EnumAttrCase<"max_unsigned", 5>,
+  I32EnumAttrCase<"min_unsigned", 6>
+]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::linalg";
+}
+def TypeFn : I32EnumAttr<"TypeFn", "", [
+  I32EnumAttrCase<"cast_signed", 0>,
+  I32EnumAttrCase<"cast_unsigned", 1>
+]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::linalg";
+}
+
+#endif // LINALG_ENUMS
diff --git a/mlir/include/mlir/Dialect/Utils/CMakeLists.txt b/mlir/include/mlir/Dialect/Utils/CMakeLists.txt
new file mode 100644
index 00000000000000..edfb1ca873abe6
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Utils/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(LLVM_TARGET_DEFINITIONS StructuredOpsUtils.td)
+mlir_tablegen(DialectUtilsEnums.h.inc -gen-enum-decls)
+mlir_tablegen(DialectUtilsEnums.cpp.inc -gen-enum-defs)
+add_public_tablegen_target(MLIRDialectUtilsIncGen)
+add_dependencies(mlir-headers MLIRDialectUtilsIncGen)
diff --git a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
index fd470c1330841d..8f7ac8cc2cee60 100644
--- a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h
@@ -23,6 +23,9 @@
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/StringRef.h"

+// Pull in all enum type definitions and utility function declarations.
+#include "mlir/Dialect/Utils/DialectUtilsEnums.h.inc"
+
 namespace mlir {

 class OpBuilder;
diff --git a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.td b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.td
new file mode 100644
index 00000000000000..4200343ce3e132
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.td
@@ -0,0 +1,23 @@
+//===- StructuredOpsUtils.td - structured ops enums --------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef STRUCTURED_OPS_UTILS
+#define STRUCTURED_OPS_UTILS
+
+include "mlir/IR/OpBase.td"
+include "mlir/IR/EnumAttr.td"
+
+def IteratorType : I32EnumAttr<"IteratorType", "Iterator type", [
+  I32EnumAttrCase<"parallel", 0>,
+  I32EnumAttrCase<"reduction", 1>
+]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::utils";
+}
+
+#endif // STRUCTURED_OPS_UTILS
diff --git a/mlir/include/mlir/Interfaces/TilingInterface.h b/mlir/include/mlir/Interfaces/TilingInterface.h
index 51f09307074cae..99cbe21b178ca3 100644
--- a/mlir/include/mlir/Interfaces/TilingInterface.h
+++ b/mlir/include/mlir/Interfaces/TilingInterface.h
@@ -14,6 +14,7 @@
 #ifndef MLIR_INTERFACES_TILINGINTERFACE_H_
 #define MLIR_INTERFACES_TILINGINTERFACE_H_

+#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Operation.h"
diff --git a/mlir/include/mlir/Interfaces/TilingInterface.td b/mlir/include/mlir/Interfaces/TilingInterface.td
index 222c075e4eebf8..b7af9765393654 100644
--- a/mlir/include/mlir/Interfaces/TilingInterface.td
+++ b/mlir/include/mlir/Interfaces/TilingInterface.td
@@ -41,13 +41,9 @@ def TilingInterface : OpInterface<"TilingInterface"> {
     >,
     InterfaceMethod<
       /*desc=*/[{
-        Returns a list of `StringRef`s that describe the number of
-        loops and the iterator types of the operation. The list is
-        expected to use
-        `getParallelIteratorTypeName()`/`getReductionIteratorTypeName()`
-        from MLIR Structured Op Utils.
+        Returns a list of iterator types that describe the number of loops.
       }],
-      /*retType=*/"SmallVector<StringRef>",
+      /*retType=*/"SmallVector<utils::IteratorType>",
       /*methodName=*/"getLoopIteratorTypes",
       /*args=*/(ins),
       /*methodBody=*/"",
diff --git a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
index 8fe631b25bad40..c691c07f9bc4d6 100644
--- a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
+++ b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
@@ -24,7 +24,7 @@
 #include "llvm/ADT/SmallBitVector.h"

 namespace mlir {
-#define GEN_PASS_DEF_CONVERTMEMREFTOLLVM
+#define GEN_PASS_DEF_MEMREFTOLLVMCONVERSIONPASS
 #include "mlir/Conversion/Passes.h.inc"
 } // namespace mlir

@@ -2108,9 +2108,9 @@ void mlir::populateMemRefToLLVMConversionPatterns(LLVMTypeConverter &converter,
 }

 namespace {
-struct MemRefToLLVMPass
-    : public impl::ConvertMemRefToLLVMBase<MemRefToLLVMPass> {
-  MemRefToLLVMPass() = default;
+struct MemRefToLLVMConversionPass
+    : public impl::MemRefToLLVMConversionPassBase<MemRefToLLVMConversionPass> {
+  using MemRefToLLVMConversionPassBase::MemRefToLLVMConversionPassBase;

   void runOnOperation() override {
     Operation *op = getOperation();
@@ -2137,7 +2137,3 @@ struct MemRefToLLVMPass
   }
 };
 } // namespace
-
-std::unique_ptr<Pass> mlir::createMemRefToLLVMPass() {
-  return std::make_unique<MemRefToLLVMPass>();
-}
diff --git a/mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp b/mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp
index bd693478a35333..1891ce813919a8 100644
--- a/mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp
+++ b/mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp
@@ -627,9 +627,21 @@ OpFoldResult arith::XOrIOp::fold(ArrayRef<Attribute> operands) {
   if (getLhs() == getRhs())
     return Builder(getContext()).getZeroAttr(getType());
   /// xor(xor(x, a), a) -> x
-  if (arith::XOrIOp prev = getLhs().getDefiningOp<arith::XOrIOp>())
+  /// xor(xor(a, x), a) -> x
+  if (arith::XOrIOp prev = getLhs().getDefiningOp<arith::XOrIOp>()) {
     if (prev.getRhs() == getRhs())
       return prev.getLhs();
+    if (prev.getLhs() == getRhs())
+      return prev.getRhs();
+  }
+  /// xor(a, xor(x, a)) -> x
+  /// xor(a, xor(a, x)) -> x
+  if (arith::XOrIOp prev = getRhs().getDefiningOp<arith::XOrIOp>()) {
+    if (prev.getRhs() == getLhs())
+      return prev.getLhs();
+    if (prev.getLhs() == getLhs())
+      return prev.getRhs();
+  }

   return constFoldBinaryOp<IntegerAttr>(
       operands, [](APInt a, const APInt &b) { return std::move(a) ^ b; });
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp
index 317240089924f9..6e4c2fc9d7393a 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp
@@ -638,7 +638,8 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) {
   auto iteratorTypesRange =
       linalgOp.iterator_types().getAsValueRange<StringAttr>();
   for (StringRef iteratorType : iteratorTypesRange) {
-    if (!llvm::is_contained(getAllIteratorTypeNames(), iteratorType))
+    if (!llvm::is_contained(getAllIteratorTypeNames(), iteratorType) ||
+        !utils::symbolizeIteratorType(iteratorType).has_value())
       return op->emitOpError("unexpected iterator_type (")
              << iteratorType << ")";
   }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp
index 870f96282edb2e..9d2a105f5ed687 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp
@@ -90,11 +90,13 @@ struct LinalgOpTilingInterface
   }

   /// Return the loop iterator type.
-  SmallVector<StringRef> getLoopIteratorTypes(Operation *op) const {
+  SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const {
     LinalgOpTy concreteOp = cast<LinalgOpTy>(op);
     return llvm::to_vector(
         llvm::map_range(concreteOp.iterator_types(), [](Attribute strAttr) {
-          return strAttr.cast<StringAttr>().getValue();
+          return utils::symbolizeIteratorType(
+                     strAttr.cast<StringAttr>().getValue())
+              .getValue();
         }));
   }
diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
index fa8209ee94f7e7..4b961a76bf4960 100644
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -76,7 +76,7 @@ void mlir::sparse_tensor::buildSparseCompiler(
   pm.addNestedPass<func::FuncOp>(createConvertSCFToCFPass());
   pm.addPass(createLowerAffinePass());
   pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions()));
-  pm.addPass(createMemRefToLLVMPass());
+  pm.addPass(createMemRefToLLVMConversionPass());
   pm.addNestedPass<func::FuncOp>(createConvertComplexToStandardPass());
   pm.addNestedPass<func::FuncOp>(
       mlir::arith::createArithmeticExpandOpsPass());
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
index 1897cde0d425e8..fab2adc4633c7b 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
@@ -36,10 +36,10 @@ struct PadOpTiling : public TilingInterface::ExternalModel<PadOpTiling, PadOp> {
     return {initTensor};
   }

-  SmallVector<StringRef> getLoopIteratorTypes(Operation *op) const {
+  SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const {
     auto padOp = cast<PadOp>(op);
-    SmallVector<StringRef> iteratorTypes(padOp.getResultType().getRank(),
-                                         getParallelIteratorTypeName());
+    SmallVector<utils::IteratorType> iteratorTypes(
+        padOp.getResultType().getRank(), utils::IteratorType::parallel);
     return iteratorTypes;
   }
diff --git a/mlir/lib/Dialect/Utils/StructuredOpsUtils.cpp b/mlir/lib/Dialect/Utils/StructuredOpsUtils.cpp
index 27b570108011ed..d6c33126edef98 100644
--- a/mlir/lib/Dialect/Utils/StructuredOpsUtils.cpp
+++ b/mlir/lib/Dialect/Utils/StructuredOpsUtils.cpp
@@ -10,6 +10,8 @@
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/BuiltinAttributes.h"

+#include "mlir/Dialect/Utils/DialectUtilsEnums.cpp.inc"
+
 using namespace mlir;

 bool mlir::isRowMajorMatmul(ArrayAttr indexingMaps) {
diff --git a/mlir/test/Dialect/Arithmetic/canonicalize.mlir b/mlir/test/Dialect/Arithmetic/canonicalize.mlir
index 649da010cd3596..632e7af4a26a3e 100644
--- a/mlir/test/Dialect/Arithmetic/canonicalize.mlir
+++ b/mlir/test/Dialect/Arithmetic/canonicalize.mlir
@@ -1585,3 +1585,51 @@ func.func @test_andi_not_fold_lhs(%arg0 : index) -> index {
   %2 = arith.andi %1, %arg0 : index
   return %2 : index
 }
+
+// -----
+/// xor(xor(x, a), a) -> x
+
+// CHECK-LABEL: @xorxor0(
+// CHECK-NOT: xori
+// CHECK: return %arg0
+func.func @xorxor0(%a : i32, %b : i32) -> i32 {
+  %c = arith.xori %a, %b : i32
+  %res = arith.xori %c, %b : i32
+  return %res : i32
+}
+
+// -----
+/// xor(xor(a, x), a) -> x
+
+// CHECK-LABEL: @xorxor1(
+// CHECK-NOT: xori
+// CHECK: return %arg0
+func.func @xorxor1(%a : i32, %b : i32) -> i32 {
+  %c = arith.xori %b, %a : i32
+  %res = arith.xori %c, %b : i32
+  return %res : i32
+}
+
+// -----
+/// xor(a, xor(x, a)) -> x
+
+// CHECK-LABEL: @xorxor2(
+// CHECK-NOT: xori
+// CHECK: return %arg0
+func.func @xorxor2(%a : i32, %b : i32) -> i32 {
+  %c = arith.xori %a, %b : i32
+  %res = arith.xori %b, %c : i32
+  return %res : i32
+}
+
+// -----
+/// xor(a, xor(a, x)) -> x
+
+// CHECK-LABEL: @xorxor3(
+// CHECK-NOT: xori
+// CHECK: return %arg0
+func.func @xorxor3(%a : i32, %b : i32) -> i32 {
+  %c = arith.xori %b, %a : i32
+  %res = arith.xori %b, %c : i32
+  return %res : i32
+}
diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
index 262125e65e5b2d..83d6794e730ef5 100644
--- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
@@ -1187,7 +1187,7 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
     if (!elseElements.empty()) {
       body << " else {\n";
       ArrayRef<FormatElement *> elseElements =
-          optional->getElseElements(/*parsable=*/true);
+          optional->getElseElements(/*parseable=*/true);
       genElementParsers(elseElements.front(), elseElements,
                         /*thenGroup=*/false);
       body << " }";
diff --git a/mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp b/mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp
index 8dbbd17ba0a08e..69d77fc7336b76 100644
--- a/mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp
+++ b/mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp
@@ -55,7 +55,7 @@ static LogicalResult runMLIRPasses(ModuleOp module) {
   passManager.addPass(createConvertGpuLaunchFuncToVulkanLaunchFuncPass());
   LowerToLLVMOptions llvmOptions(module.getContext(), DataLayout(module));

-  passManager.addPass(createMemRefToLLVMPass());
+  passManager.addPass(createMemRefToLLVMConversionPass());
   passManager.nest<func::FuncOp>().addPass(LLVM::createRequestCWrappersPass());
   passManager.addPass(createConvertFuncToLLVMPass(llvmOptions));
   passManager.addPass(createReconcileUnrealizedCastsPass());
diff --git a/mlir/unittests/ExecutionEngine/Invoke.cpp b/mlir/unittests/ExecutionEngine/Invoke.cpp
index 60b5be6198ee61..676293a002eea4 100644
--- a/mlir/unittests/ExecutionEngine/Invoke.cpp
+++ b/mlir/unittests/ExecutionEngine/Invoke.cpp
@@ -53,7 +53,7 @@ static struct LLVMInitializer {
 /// dialects lowering to LLVM Dialect.
 static LogicalResult lowerToLLVMDialect(ModuleOp module) {
   PassManager pm(module.getContext());
-  pm.addPass(mlir::createMemRefToLLVMPass());
+  pm.addPass(mlir::createMemRefToLLVMConversionPass());
   pm.addNestedPass<func::FuncOp>(
       mlir::arith::createConvertArithmeticToLLVMPass());
   pm.addPass(mlir::createConvertFuncToLLVMPass());
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index d57eb3df274d55..90aea2c75cc2e6 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -980,6 +980,7 @@ cc_library(
     ],
     textual_hdrs = [
         "src/string/memory_utils/bcmp_implementations.h",
+        "src/string/memory_utils/bzero_implementations.h",
         "src/string/memory_utils/memcmp_implementations.h",
         "src/string/memory_utils/memcpy_implementations.h",
         "src/string/memory_utils/memset_implementations.h",
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index c400e4529b12f8..8fade09a5448e0 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -2160,6 +2160,14 @@ gentbl_cc_library(
             ["-gen-op-doc"],
             "g3doc/Dialects/NVGPU/NVGPU.md",
         ),
+        (
+            ["-gen-typedef-decls"],
+            "include/mlir/Dialect/NVGPU/IR/NVGPUTypes.h.inc",
+        ),
+        (
+            ["-gen-typedef-defs"],
+            "include/mlir/Dialect/NVGPU/IR/NVGPUTypes.cpp.inc",
+        ),
     ],
     tblgen = ":mlir-tblgen",
     td_file = "include/mlir/Dialect/NVGPU/IR/NVGPU.td",
@@ -2298,6 +2306,33 @@ cc_library(
     ],
 )

+td_library(
+    name = "DialectUtilsTdFiles",
+    srcs = [
+        "include/mlir/Dialect/Utils/StructuredOpsUtils.td",
+    ],
+    includes = ["include"],
+    deps = [":OpBaseTdFiles"],
+)
+
+gentbl_cc_library(
+    name = "DialectUtilsIncGen",
+    strip_include_prefix = "include",
+    tbl_outs = [
+        (
+            ["-gen-enum-decls"],
+            "include/mlir/Dialect/Utils/DialectUtilsEnums.h.inc",
+        ),
+        (
+            ["-gen-enum-defs"],
+            "include/mlir/Dialect/Utils/DialectUtilsEnums.cpp.inc",
+        ),
+    ],
+    tblgen = ":mlir-tblgen",
+    td_file = "include/mlir/Dialect/Utils/StructuredOpsUtils.td",
+    deps = [":DialectUtilsTdFiles"],
+)
+
 cc_library(
     name = "DialectUtils",
     srcs = glob([
@@ -2309,6 +2344,7 @@ cc_library(
     ]),
     includes = ["include"],
     deps = [
+        ":DialectUtilsIncGen",
         ":IR",
         ":Support",
         "//llvm:Support",
     ],
 )
@@ -7190,11 +7226,13 @@ td_library(
     name = "LinalgOpsTdFiles",
     srcs = [
         "include/mlir/Dialect/Linalg/IR/LinalgBase.td",
+        "include/mlir/Dialect/Linalg/IR/LinalgEnums.td",
         "include/mlir/Dialect/Linalg/IR/LinalgOps.td",
     ],
     includes = ["include"],
     deps = [
         ":ControlFlowInterfacesTdFiles",
+        ":DialectUtilsTdFiles",
         ":InferTypeOpInterfaceTdFiles",
         ":LoopLikeInterfaceTdFiles",
         ":OpBaseTdFiles",
@@ -7242,14 +7280,6 @@ gentbl_cc_library(
             ],
             "include/mlir/Dialect/Linalg/IR/LinalgOpsDialect.cpp.inc",
         ),
-        (
-            ["-gen-enum-decls"],
-            "include/mlir/Dialect/Linalg/IR/LinalgOpsEnums.h.inc",
-        ),
-        (
-            ["-gen-enum-defs"],
-            "include/mlir/Dialect/Linalg/IR/LinalgOpsEnums.cpp.inc",
-        ),
         (
             ["-gen-attrdef-decls"],
             "include/mlir/Dialect/Linalg/IR/LinalgOpsAttrDefs.h.inc",
         ),
@@ -7264,6 +7294,24 @@ gentbl_cc_library(
     deps = [":LinalgOpsTdFiles"],
 )

+gentbl_cc_library(
+    name = "LinalgEnumsIncGen",
+    strip_include_prefix = "include",
+    tbl_outs = [
+        (
+            ["-gen-enum-decls"],
+            "include/mlir/Dialect/Linalg/IR/LinalgOpsEnums.h.inc",
+        ),
+        (
+            ["-gen-enum-defs"],
+            "include/mlir/Dialect/Linalg/IR/LinalgOpsEnums.cpp.inc",
+        ),
+    ],
+    tblgen = ":mlir-tblgen",
+    td_file = "include/mlir/Dialect/Linalg/IR/LinalgEnums.td",
+    deps = [":LinalgOpsTdFiles"],
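+    # Editorial note (assumption about usage): consumers include the generated
+    # headers as "mlir/Dialect/Linalg/IR/LinalgOpsEnums.h.inc", which is why
+    # strip_include_prefix = "include" is set on this rule.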
[":LinalgOpsTdFiles"], +) + gentbl_cc_library( name = "LinalgTransformOpsIncGen", strip_include_prefix = "include", @@ -7519,6 +7567,7 @@ cc_library( ":FuncDialect", ":IR", ":InferTypeOpInterface", + ":LinalgEnumsIncGen", ":LinalgInterfacesIncGen", ":LinalgNamedStructuredOpsYamlIncGen", ":LinalgOpsIncGen", @@ -7709,6 +7758,7 @@ cc_library( hdrs = ["include/mlir/Interfaces/TilingInterface.h"], includes = ["include"], deps = [ + ":DialectUtils", ":IR", ":Support", ":TilingInterfaceIncGen",